diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/added_tokens.json b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/config.json b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/config.json new file mode 100644 index 0000000000000000000000000000000000000000..73ab92e38f84782475b94b505806c28379a3720e --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/config.json @@ -0,0 +1,181 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "dropout": false, + "embd_pdrop": 0.0, + "eos_token_id": 32000, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 8192, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_type": "llava_phi", + "moe_name": "competesmoe", + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/generation_config.json b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00001-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..166082443b2f0710b51741dedd887e3e490c8852 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d4a1a23a8010e97fbba7d1e86287a0fa1481eb95e1513318083a42d17d88f1 +size 4972489328 diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00002-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bd158f41f7e9565abdf91a1e4ae4190789c3ccab --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7731b764c63de002f22b6ab47c19860da5cc43a2e051898c54d1ceb4bafd0f5 +size 4985754844 diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00003-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..caccdc89d7a67ef9a755e4a5202d60671d3c0388 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d245f364c5c3a4fff32aadf462b2d87264c18fe59d0854c15778868a7e4d047 +size 248943552 diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model.safetensors.index.json b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..a900cb68b39c8fe6eedc011196340060a750c9c8 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/model.safetensors.index.json @@ -0,0 +1,1033 @@ +{ + "metadata": { + "total_size": 10207040684 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/special_tokens_map.json b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/tokenizer.model b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/tokenizer_config.json b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/trainer_state.json b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1d4fb8a59b976645e30f8595c34b4088f109086c --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/trainer_state.json @@ -0,0 +1,124783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999398785546805, + "eval_steps": 500, + "global_step": 8316, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.02250182, + "auxiliary_loss_mlp": 0.02234257, + "balance_loss_clip": 1.90925574, + "balance_loss_mlp": 1.92574382, + "epoch": 0.00012024289063909097, + "flos": 24932483919360.0, + "grad_norm": 39.930954820894954, + "language_loss": 2.58213806, + "learning_rate": 0.0, + "loss": 1.88551211, + "num_input_tokens_seen": 20375, + "step": 1, + "time_per_iteration": 15.110378980636597 + }, + { + "auxiliary_loss_clip": 0.01497569, + "auxiliary_loss_mlp": 0.01449668, + "balance_loss_clip": 1.27090144, + "balance_loss_mlp": 1.24214828, + "epoch": 0.00024048578127818193, + "flos": 30664624377600.0, + "grad_norm": 66.32231177649138, + "language_loss": 1.89202523, + "learning_rate": 5.021476677069823e-07, + "loss": 1.92149758, + "num_input_tokens_seen": 39035, + "step": 2, + "time_per_iteration": 2.654712200164795 + }, + { + "auxiliary_loss_clip": 0.01500063, + "auxiliary_loss_mlp": 0.01459375, + "balance_loss_clip": 1.27330434, + "balance_loss_mlp": 1.24613333, + "epoch": 0.0003607286719172729, + "flos": 19026227969280.0, + "grad_norm": 94.51213799538331, + "language_loss": 1.61757326, + "learning_rate": 7.958852231401551e-07, + "loss": 1.64716768, + "num_input_tokens_seen": 57600, + "step": 3, + "time_per_iteration": 2.5200393199920654 + }, + { + "auxiliary_loss_clip": 0.01501415, + "auxiliary_loss_mlp": 0.01493251, + "balance_loss_clip": 1.27449548, + "balance_loss_mlp": 1.28229797, + "epoch": 0.00048097156255636386, + "flos": 19316314206720.0, + "grad_norm": 36.420036422786474, + "language_loss": 1.64457178, + "learning_rate": 1.0042953354139647e-06, + "loss": 1.67451835, + "num_input_tokens_seen": 76465, + "step": 4, + "time_per_iteration": 2.546741247177124 + }, + { + "auxiliary_loss_clip": 0.01500116, + "auxiliary_loss_mlp": 0.01451569, + "balance_loss_clip": 1.27311373, + "balance_loss_mlp": 1.24366796, + "epoch": 0.0006012144531954548, + "flos": 13991264893440.0, + "grad_norm": 55.29840632937083, + "language_loss": 1.94002509, + "learning_rate": 1.1659507774310057e-06, + "loss": 1.96954191, + "num_input_tokens_seen": 94350, + "step": 5, + "time_per_iteration": 2.7414772510528564 + }, + { + "auxiliary_loss_clip": 0.01499066, + "auxiliary_loss_mlp": 0.01449117, + "balance_loss_clip": 1.27209818, + "balance_loss_mlp": 1.24102497, + "epoch": 0.0007214573438345458, + "flos": 23148988225920.0, + "grad_norm": 44.92756608891305, + "language_loss": 1.61336565, + "learning_rate": 1.2980328908471373e-06, + "loss": 1.64284742, + "num_input_tokens_seen": 114595, + "step": 6, + "time_per_iteration": 2.8056447505950928 + }, + { + "auxiliary_loss_clip": 0.01632423, + "auxiliary_loss_mlp": 0.01885595, + "balance_loss_clip": 1.40486896, + "balance_loss_mlp": 1.6734978, + "epoch": 0.0008417002344736367, + "flos": 67663246170240.0, + "grad_norm": 4.568354970047484, + "language_loss": 0.81552339, + "learning_rate": 1.4097067265369432e-06, + "loss": 0.8507036, + "num_input_tokens_seen": 179590, + "step": 7, + "time_per_iteration": 3.2105791568756104 + }, + { + "auxiliary_loss_clip": 0.01498923, + "auxiliary_loss_mlp": 0.01457513, + "balance_loss_clip": 1.27177882, + "balance_loss_mlp": 1.24694192, + "epoch": 0.0009619431251127277, + "flos": 21281381504640.0, + "grad_norm": 41.71539685529381, + "language_loss": 1.59074938, + "learning_rate": 1.506443003120947e-06, + "loss": 1.62031364, + "num_input_tokens_seen": 195090, + "step": 8, + "time_per_iteration": 2.782014846801758 + }, + { + "auxiliary_loss_clip": 0.0150184, + "auxiliary_loss_mlp": 0.01446374, + "balance_loss_clip": 1.27431464, + "balance_loss_mlp": 1.23818648, + "epoch": 0.0010821860157518186, + "flos": 23331342597120.0, + "grad_norm": 18.105308431406616, + "language_loss": 1.47944713, + "learning_rate": 1.5917704462803102e-06, + "loss": 1.50892949, + "num_input_tokens_seen": 211635, + "step": 9, + "time_per_iteration": 2.7259347438812256 + }, + { + "auxiliary_loss_clip": 0.01503469, + "auxiliary_loss_mlp": 0.01475986, + "balance_loss_clip": 1.27593446, + "balance_loss_mlp": 1.26665449, + "epoch": 0.0012024289063909096, + "flos": 17010166337280.0, + "grad_norm": 13.69685678323049, + "language_loss": 1.53215623, + "learning_rate": 1.6680984451379884e-06, + "loss": 1.56195092, + "num_input_tokens_seen": 224705, + "step": 10, + "time_per_iteration": 2.8456339836120605 + }, + { + "auxiliary_loss_clip": 0.01503957, + "auxiliary_loss_mlp": 0.01465481, + "balance_loss_clip": 1.27646899, + "balance_loss_mlp": 1.25777054, + "epoch": 0.0013226717970300007, + "flos": 21288133261440.0, + "grad_norm": 13.827191500282257, + "language_loss": 1.32726765, + "learning_rate": 1.7371455188905097e-06, + "loss": 1.35696197, + "num_input_tokens_seen": 244635, + "step": 11, + "time_per_iteration": 2.8355727195739746 + }, + { + "auxiliary_loss_clip": 0.01498694, + "auxiliary_loss_mlp": 0.01458105, + "balance_loss_clip": 1.27195644, + "balance_loss_mlp": 1.25115812, + "epoch": 0.0014429146876690916, + "flos": 27237884935680.0, + "grad_norm": 10.585159220709775, + "language_loss": 1.25395691, + "learning_rate": 1.8001805585541196e-06, + "loss": 1.28352499, + "num_input_tokens_seen": 265765, + "step": 12, + "time_per_iteration": 2.8393161296844482 + }, + { + "auxiliary_loss_clip": 0.01503103, + "auxiliary_loss_mlp": 0.01454593, + "balance_loss_clip": 1.27583575, + "balance_loss_mlp": 1.24306786, + "epoch": 0.0015631575783081825, + "flos": 19062174504960.0, + "grad_norm": 6.747662674478728, + "language_loss": 1.29682612, + "learning_rate": 1.8581671739548328e-06, + "loss": 1.32640314, + "num_input_tokens_seen": 283500, + "step": 13, + "time_per_iteration": 2.7749040126800537 + }, + { + "auxiliary_loss_clip": 0.0149923, + "auxiliary_loss_mlp": 0.01422225, + "balance_loss_clip": 1.27230823, + "balance_loss_mlp": 1.21508729, + "epoch": 0.0016834004689472734, + "flos": 48139473985920.0, + "grad_norm": 6.281820683104416, + "language_loss": 1.13746762, + "learning_rate": 1.9118543942439254e-06, + "loss": 1.16668212, + "num_input_tokens_seen": 305685, + "step": 14, + "time_per_iteration": 3.9973580837249756 + }, + { + "auxiliary_loss_clip": 0.01503628, + "auxiliary_loss_mlp": 0.0145034, + "balance_loss_clip": 1.27582479, + "balance_loss_mlp": 1.24091363, + "epoch": 0.0018036433595863645, + "flos": 34970026314240.0, + "grad_norm": 5.543357641154431, + "language_loss": 1.12830865, + "learning_rate": 1.961836000571161e-06, + "loss": 1.15784836, + "num_input_tokens_seen": 327340, + "step": 15, + "time_per_iteration": 2.9357988834381104 + }, + { + "auxiliary_loss_clip": 0.01628933, + "auxiliary_loss_mlp": 0.0179886, + "balance_loss_clip": 1.40147853, + "balance_loss_mlp": 1.5928663, + "epoch": 0.0019238862502254555, + "flos": 59768284440960.0, + "grad_norm": 3.779523602579477, + "language_loss": 0.64637804, + "learning_rate": 2.0085906708279293e-06, + "loss": 0.68065602, + "num_input_tokens_seen": 382710, + "step": 16, + "time_per_iteration": 3.2454710006713867 + }, + { + "auxiliary_loss_clip": 0.01500924, + "auxiliary_loss_mlp": 0.01458279, + "balance_loss_clip": 1.27383161, + "balance_loss_mlp": 1.25180888, + "epoch": 0.0020441291408645466, + "flos": 20814543417600.0, + "grad_norm": 4.915095722472916, + "language_loss": 1.16140723, + "learning_rate": 2.0525099325728135e-06, + "loss": 1.19099939, + "num_input_tokens_seen": 400890, + "step": 17, + "time_per_iteration": 2.81480073928833 + }, + { + "auxiliary_loss_clip": 0.01628042, + "auxiliary_loss_mlp": 0.01780977, + "balance_loss_clip": 1.40057302, + "balance_loss_mlp": 1.57650971, + "epoch": 0.0021643720315036373, + "flos": 63857001582720.0, + "grad_norm": 3.532968486197211, + "language_loss": 0.72134924, + "learning_rate": 2.0939181139872922e-06, + "loss": 0.7554394, + "num_input_tokens_seen": 462605, + "step": 18, + "time_per_iteration": 3.25620174407959 + }, + { + "auxiliary_loss_clip": 0.01498313, + "auxiliary_loss_mlp": 0.01468396, + "balance_loss_clip": 1.27142096, + "balance_loss_mlp": 1.26297498, + "epoch": 0.0022846149221427284, + "flos": 31284981192960.0, + "grad_norm": 4.792802822793903, + "language_loss": 1.01536536, + "learning_rate": 2.1330868934640175e-06, + "loss": 1.04503238, + "num_input_tokens_seen": 483280, + "step": 19, + "time_per_iteration": 2.837735414505005 + }, + { + "auxiliary_loss_clip": 0.01625943, + "auxiliary_loss_mlp": 0.0174634, + "balance_loss_clip": 1.39879918, + "balance_loss_mlp": 1.54492462, + "epoch": 0.002404857812781819, + "flos": 51083648161920.0, + "grad_norm": 3.5651481818252657, + "language_loss": 0.76470482, + "learning_rate": 2.170246112844971e-06, + "loss": 0.79842758, + "num_input_tokens_seen": 537620, + "step": 20, + "time_per_iteration": 3.087188720703125 + }, + { + "auxiliary_loss_clip": 0.01498097, + "auxiliary_loss_mlp": 0.01428594, + "balance_loss_clip": 1.27116156, + "balance_loss_mlp": 1.22507989, + "epoch": 0.0025251007034209102, + "flos": 15815347309440.0, + "grad_norm": 5.281921156485926, + "language_loss": 1.01582479, + "learning_rate": 2.2055919496770983e-06, + "loss": 1.04509163, + "num_input_tokens_seen": 555760, + "step": 21, + "time_per_iteration": 2.833425521850586 + }, + { + "auxiliary_loss_clip": 0.01494982, + "auxiliary_loss_mlp": 0.01414907, + "balance_loss_clip": 1.26823711, + "balance_loss_mlp": 1.21177411, + "epoch": 0.0026453435940600014, + "flos": 37851857458560.0, + "grad_norm": 5.065062361523867, + "language_loss": 0.89615041, + "learning_rate": 2.2392931865974923e-06, + "loss": 0.92524934, + "num_input_tokens_seen": 578450, + "step": 22, + "time_per_iteration": 2.8427734375 + }, + { + "auxiliary_loss_clip": 0.01493908, + "auxiliary_loss_mlp": 0.01421468, + "balance_loss_clip": 1.26689816, + "balance_loss_mlp": 1.21814466, + "epoch": 0.002765586484699092, + "flos": 21141976821120.0, + "grad_norm": 6.105624987877641, + "language_loss": 1.01829243, + "learning_rate": 2.271496085962064e-06, + "loss": 1.04744613, + "num_input_tokens_seen": 596145, + "step": 23, + "time_per_iteration": 2.7736740112304688 + }, + { + "auxiliary_loss_clip": 0.01490125, + "auxiliary_loss_mlp": 0.01418386, + "balance_loss_clip": 1.26353407, + "balance_loss_mlp": 1.21334624, + "epoch": 0.002885829375338183, + "flos": 20667381396480.0, + "grad_norm": 3.1457892956736115, + "language_loss": 1.02805912, + "learning_rate": 2.3023282262611022e-06, + "loss": 1.05714417, + "num_input_tokens_seen": 614920, + "step": 24, + "time_per_iteration": 2.7816321849823 + }, + { + "auxiliary_loss_clip": 0.01491882, + "auxiliary_loss_mlp": 0.0142094, + "balance_loss_clip": 1.26496291, + "balance_loss_mlp": 1.21961927, + "epoch": 0.003006072265977274, + "flos": 34823869873920.0, + "grad_norm": 3.3888587719085796, + "language_loss": 0.92550582, + "learning_rate": 2.3319015548620114e-06, + "loss": 0.95463401, + "num_input_tokens_seen": 636060, + "step": 25, + "time_per_iteration": 2.8838248252868652 + }, + { + "auxiliary_loss_clip": 0.01489312, + "auxiliary_loss_mlp": 0.01396759, + "balance_loss_clip": 1.26238203, + "balance_loss_mlp": 1.1997298, + "epoch": 0.003126315156616365, + "flos": 24422021118720.0, + "grad_norm": 2.2741994150142704, + "language_loss": 0.93003589, + "learning_rate": 2.3603148416618152e-06, + "loss": 0.95889664, + "num_input_tokens_seen": 655575, + "step": 26, + "time_per_iteration": 2.7989790439605713 + }, + { + "auxiliary_loss_clip": 0.01488619, + "auxiliary_loss_mlp": 0.0139863, + "balance_loss_clip": 1.26215136, + "balance_loss_mlp": 1.19826245, + "epoch": 0.003246558047255456, + "flos": 23622326674560.0, + "grad_norm": 2.453710776218704, + "language_loss": 1.0087353, + "learning_rate": 2.3876556694204647e-06, + "loss": 1.03760779, + "num_input_tokens_seen": 675730, + "step": 27, + "time_per_iteration": 2.8003287315368652 + }, + { + "auxiliary_loss_clip": 0.01489241, + "auxiliary_loss_mlp": 0.01398477, + "balance_loss_clip": 1.26298189, + "balance_loss_mlp": 1.19629824, + "epoch": 0.003366800937894547, + "flos": 17820275725440.0, + "grad_norm": 3.5239991247461173, + "language_loss": 0.90737122, + "learning_rate": 2.414002061950908e-06, + "loss": 0.93624836, + "num_input_tokens_seen": 694605, + "step": 28, + "time_per_iteration": 2.7522695064544678 + }, + { + "auxiliary_loss_clip": 0.01485398, + "auxiliary_loss_mlp": 0.01411446, + "balance_loss_clip": 1.25875235, + "balance_loss_mlp": 1.20917201, + "epoch": 0.003487043828533638, + "flos": 24426115269120.0, + "grad_norm": 2.341767959267583, + "language_loss": 0.99779767, + "learning_rate": 2.4394238264681557e-06, + "loss": 1.02676606, + "num_input_tokens_seen": 714340, + "step": 29, + "time_per_iteration": 2.777336359024048 + }, + { + "auxiliary_loss_clip": 0.01485056, + "auxiliary_loss_mlp": 0.01399438, + "balance_loss_clip": 1.25890291, + "balance_loss_mlp": 1.20078754, + "epoch": 0.003607286719172729, + "flos": 26140311002880.0, + "grad_norm": 2.1096074457482272, + "language_loss": 0.99555767, + "learning_rate": 2.4639836682781433e-06, + "loss": 1.02440262, + "num_input_tokens_seen": 734470, + "step": 30, + "time_per_iteration": 2.8165183067321777 + }, + { + "auxiliary_loss_clip": 0.01493, + "auxiliary_loss_mlp": 0.01394756, + "balance_loss_clip": 1.26621091, + "balance_loss_mlp": 1.19572449, + "epoch": 0.00372752960981182, + "flos": 20593082113920.0, + "grad_norm": 2.6417133987169623, + "language_loss": 1.00300574, + "learning_rate": 2.487738122623307e-06, + "loss": 1.03188324, + "num_input_tokens_seen": 753380, + "step": 31, + "time_per_iteration": 2.7421090602874756 + }, + { + "auxiliary_loss_clip": 0.01485783, + "auxiliary_loss_mlp": 0.0140293, + "balance_loss_clip": 1.25908947, + "balance_loss_mlp": 1.20695031, + "epoch": 0.003847772500450911, + "flos": 22674608282880.0, + "grad_norm": 3.5301675802177384, + "language_loss": 0.98918223, + "learning_rate": 2.510738338534912e-06, + "loss": 1.01806927, + "num_input_tokens_seen": 772105, + "step": 32, + "time_per_iteration": 2.811126708984375 + }, + { + "auxiliary_loss_clip": 0.01477731, + "auxiliary_loss_mlp": 0.0137896, + "balance_loss_clip": 1.25108659, + "balance_loss_mlp": 1.1840291, + "epoch": 0.003968015391090002, + "flos": 17967796882560.0, + "grad_norm": 2.3552133188045627, + "language_loss": 1.02633166, + "learning_rate": 2.5330307420306648e-06, + "loss": 1.05489862, + "num_input_tokens_seen": 788955, + "step": 33, + "time_per_iteration": 2.6991426944732666 + }, + { + "auxiliary_loss_clip": 0.01478068, + "auxiliary_loss_mlp": 0.01371749, + "balance_loss_clip": 1.25164628, + "balance_loss_mlp": 1.17815292, + "epoch": 0.004088258281729093, + "flos": 27304103658240.0, + "grad_norm": 4.104891642987217, + "language_loss": 0.87997502, + "learning_rate": 2.554657600279796e-06, + "loss": 0.90847313, + "num_input_tokens_seen": 810230, + "step": 34, + "time_per_iteration": 2.8045008182525635 + }, + { + "auxiliary_loss_clip": 0.0147614, + "auxiliary_loss_mlp": 0.01350554, + "balance_loss_clip": 1.24951029, + "balance_loss_mlp": 1.15142632, + "epoch": 0.004208501172368184, + "flos": 23258587599360.0, + "grad_norm": 2.084639840939523, + "language_loss": 1.03520417, + "learning_rate": 2.5756575039679493e-06, + "loss": 1.06347108, + "num_input_tokens_seen": 829780, + "step": 35, + "time_per_iteration": 2.80822491645813 + }, + { + "auxiliary_loss_clip": 0.01467582, + "auxiliary_loss_mlp": 0.01372224, + "balance_loss_clip": 1.24106622, + "balance_loss_mlp": 1.17767429, + "epoch": 0.0043287440630072746, + "flos": 17312104062720.0, + "grad_norm": 1.9351530254250338, + "language_loss": 0.95189619, + "learning_rate": 2.5960657816942747e-06, + "loss": 0.98029423, + "num_input_tokens_seen": 848695, + "step": 36, + "time_per_iteration": 2.7961692810058594 + }, + { + "auxiliary_loss_clip": 0.01590989, + "auxiliary_loss_mlp": 0.0148532, + "balance_loss_clip": 1.36474872, + "balance_loss_mlp": 1.3136586, + "epoch": 0.004448986953646365, + "flos": 53092491160320.0, + "grad_norm": 1.3445061019139684, + "language_loss": 0.60964584, + "learning_rate": 2.6159148575788668e-06, + "loss": 0.64040887, + "num_input_tokens_seen": 906730, + "step": 37, + "time_per_iteration": 3.2845959663391113 + }, + { + "auxiliary_loss_clip": 0.01471636, + "auxiliary_loss_mlp": 0.01354047, + "balance_loss_clip": 1.24527955, + "balance_loss_mlp": 1.16197717, + "epoch": 0.004569229844285457, + "flos": 13444165866240.0, + "grad_norm": 2.3448797531700882, + "language_loss": 0.98720348, + "learning_rate": 2.635234561171e-06, + "loss": 1.01546025, + "num_input_tokens_seen": 925125, + "step": 38, + "time_per_iteration": 2.7415685653686523 + }, + { + "auxiliary_loss_clip": 0.01469087, + "auxiliary_loss_mlp": 0.01342883, + "balance_loss_clip": 1.24287009, + "balance_loss_mlp": 1.15148056, + "epoch": 0.0046894727349245475, + "flos": 16209609966720.0, + "grad_norm": 2.3569181439815274, + "language_loss": 0.94081843, + "learning_rate": 2.6540523970949877e-06, + "loss": 0.96893811, + "num_input_tokens_seen": 939970, + "step": 39, + "time_per_iteration": 3.7693653106689453 + }, + { + "auxiliary_loss_clip": 0.01470264, + "auxiliary_loss_mlp": 0.01332663, + "balance_loss_clip": 1.24466169, + "balance_loss_mlp": 1.14936686, + "epoch": 0.004809715625563638, + "flos": 23914244505600.0, + "grad_norm": 3.630847155708061, + "language_loss": 0.92461479, + "learning_rate": 2.6723937805519533e-06, + "loss": 0.95264411, + "num_input_tokens_seen": 957470, + "step": 40, + "time_per_iteration": 4.628031969070435 + }, + { + "auxiliary_loss_clip": 0.01466451, + "auxiliary_loss_mlp": 0.01328265, + "balance_loss_clip": 1.24026752, + "balance_loss_mlp": 1.14411056, + "epoch": 0.00492995851620273, + "flos": 20773030273920.0, + "grad_norm": 2.2342092225141013, + "language_loss": 0.92927611, + "learning_rate": 2.690282243737839e-06, + "loss": 0.9572233, + "num_input_tokens_seen": 976405, + "step": 41, + "time_per_iteration": 2.8305296897888184 + }, + { + "auxiliary_loss_clip": 0.01463016, + "auxiliary_loss_mlp": 0.01317831, + "balance_loss_clip": 1.23709428, + "balance_loss_mlp": 1.13157868, + "epoch": 0.0050502014068418205, + "flos": 20338655103360.0, + "grad_norm": 10.981964747205637, + "language_loss": 0.99324274, + "learning_rate": 2.7077396173840807e-06, + "loss": 1.02105117, + "num_input_tokens_seen": 994690, + "step": 42, + "time_per_iteration": 2.7842111587524414 + }, + { + "auxiliary_loss_clip": 0.01461331, + "auxiliary_loss_mlp": 0.01331565, + "balance_loss_clip": 1.2354672, + "balance_loss_mlp": 1.14597964, + "epoch": 0.005170444297480911, + "flos": 25994872834560.0, + "grad_norm": 2.196527370828916, + "language_loss": 0.92796052, + "learning_rate": 2.7247861909342594e-06, + "loss": 0.95588946, + "num_input_tokens_seen": 1015615, + "step": 43, + "time_per_iteration": 2.85964035987854 + }, + { + "auxiliary_loss_clip": 0.01462463, + "auxiliary_loss_mlp": 0.01325707, + "balance_loss_clip": 1.23630476, + "balance_loss_mlp": 1.13916779, + "epoch": 0.005290687188120003, + "flos": 20954055841920.0, + "grad_norm": 2.3909839374874244, + "language_loss": 0.83098131, + "learning_rate": 2.7414408543044743e-06, + "loss": 0.858863, + "num_input_tokens_seen": 1031255, + "step": 44, + "time_per_iteration": 2.7484219074249268 + }, + { + "auxiliary_loss_clip": 0.01457382, + "auxiliary_loss_mlp": 0.01286789, + "balance_loss_clip": 1.23196566, + "balance_loss_mlp": 1.10454178, + "epoch": 0.005410930078759093, + "flos": 15851401585920.0, + "grad_norm": 5.359017159503153, + "language_loss": 0.79438448, + "learning_rate": 2.7577212237113157e-06, + "loss": 0.82182616, + "num_input_tokens_seen": 1048295, + "step": 45, + "time_per_iteration": 2.809617042541504 + }, + { + "auxiliary_loss_clip": 0.01462164, + "auxiliary_loss_mlp": 0.01302126, + "balance_loss_clip": 1.23667097, + "balance_loss_mlp": 1.1205461, + "epoch": 0.005531172969398184, + "flos": 21104988791040.0, + "grad_norm": 2.0410092330961356, + "language_loss": 1.04289484, + "learning_rate": 2.7736437536690466e-06, + "loss": 1.07053757, + "num_input_tokens_seen": 1067925, + "step": 46, + "time_per_iteration": 2.798321008682251 + }, + { + "auxiliary_loss_clip": 0.01456674, + "auxiliary_loss_mlp": 0.01298483, + "balance_loss_clip": 1.23137593, + "balance_loss_mlp": 1.12119436, + "epoch": 0.005651415860037276, + "flos": 20844887431680.0, + "grad_norm": 2.1899029513065487, + "language_loss": 1.07862842, + "learning_rate": 2.789223836941131e-06, + "loss": 1.10618007, + "num_input_tokens_seen": 1088060, + "step": 47, + "time_per_iteration": 2.852898120880127 + }, + { + "auxiliary_loss_clip": 0.01456404, + "auxiliary_loss_mlp": 0.0129994, + "balance_loss_clip": 1.23135996, + "balance_loss_mlp": 1.11721611, + "epoch": 0.005771658750676366, + "flos": 13260195383040.0, + "grad_norm": 2.3761804985538575, + "language_loss": 1.0876224, + "learning_rate": 2.8044758939680847e-06, + "loss": 1.11518586, + "num_input_tokens_seen": 1104130, + "step": 48, + "time_per_iteration": 2.8627729415893555 + }, + { + "auxiliary_loss_clip": 0.01458082, + "auxiliary_loss_mlp": 0.01293337, + "balance_loss_clip": 1.23351836, + "balance_loss_mlp": 1.11862433, + "epoch": 0.005891901641315457, + "flos": 24425396997120.0, + "grad_norm": 2.717887282326561, + "language_loss": 1.02221704, + "learning_rate": 2.8194134530738863e-06, + "loss": 1.04973125, + "num_input_tokens_seen": 1122900, + "step": 49, + "time_per_iteration": 2.8428900241851807 + }, + { + "auxiliary_loss_clip": 0.01457357, + "auxiliary_loss_mlp": 0.01291902, + "balance_loss_clip": 1.23262, + "balance_loss_mlp": 1.11823761, + "epoch": 0.006012144531954548, + "flos": 23076197314560.0, + "grad_norm": 3.1634362166015646, + "language_loss": 0.90188348, + "learning_rate": 2.834049222568994e-06, + "loss": 0.92937607, + "num_input_tokens_seen": 1140250, + "step": 50, + "time_per_iteration": 2.9086480140686035 + }, + { + "auxiliary_loss_clip": 0.01453497, + "auxiliary_loss_mlp": 0.01278962, + "balance_loss_clip": 1.22828507, + "balance_loss_mlp": 1.10472608, + "epoch": 0.006132387422593639, + "flos": 22528775064960.0, + "grad_norm": 1.9798034160479587, + "language_loss": 0.92548609, + "learning_rate": 2.848395155712969e-06, + "loss": 0.95281065, + "num_input_tokens_seen": 1160470, + "step": 51, + "time_per_iteration": 2.8562369346618652 + }, + { + "auxiliary_loss_clip": 0.01457998, + "auxiliary_loss_mlp": 0.01267616, + "balance_loss_clip": 1.23325562, + "balance_loss_mlp": 1.09757578, + "epoch": 0.00625263031323273, + "flos": 27628340751360.0, + "grad_norm": 2.070954412845351, + "language_loss": 0.97650969, + "learning_rate": 2.8624625093687977e-06, + "loss": 1.00376582, + "num_input_tokens_seen": 1177605, + "step": 52, + "time_per_iteration": 2.873267650604248 + }, + { + "auxiliary_loss_clip": 0.01453636, + "auxiliary_loss_mlp": 0.01284572, + "balance_loss_clip": 1.22846508, + "balance_loss_mlp": 1.11529458, + "epoch": 0.006372873203871821, + "flos": 23110671392640.0, + "grad_norm": 2.3910538188534147, + "language_loss": 0.89046532, + "learning_rate": 2.876261897070029e-06, + "loss": 0.91784739, + "num_input_tokens_seen": 1197735, + "step": 53, + "time_per_iteration": 2.812814235687256 + }, + { + "auxiliary_loss_clip": 0.01453273, + "auxiliary_loss_mlp": 0.01272317, + "balance_loss_clip": 1.22817564, + "balance_loss_mlp": 1.10227656, + "epoch": 0.006493116094510912, + "flos": 22856028900480.0, + "grad_norm": 2.3219995044944257, + "language_loss": 0.92378414, + "learning_rate": 2.889803337127447e-06, + "loss": 0.95104003, + "num_input_tokens_seen": 1216335, + "step": 54, + "time_per_iteration": 2.8055026531219482 + }, + { + "auxiliary_loss_clip": 0.014524, + "auxiliary_loss_mlp": 0.0125892, + "balance_loss_clip": 1.22784114, + "balance_loss_mlp": 1.09002447, + "epoch": 0.006613358985150003, + "flos": 23071708114560.0, + "grad_norm": 2.32898070729037, + "language_loss": 0.84409016, + "learning_rate": 2.903096296321516e-06, + "loss": 0.8712033, + "num_input_tokens_seen": 1234480, + "step": 55, + "time_per_iteration": 2.8563733100891113 + }, + { + "auxiliary_loss_clip": 0.01449105, + "auxiliary_loss_mlp": 0.01241174, + "balance_loss_clip": 1.2251153, + "balance_loss_mlp": 1.07866764, + "epoch": 0.006733601875789094, + "flos": 26537662229760.0, + "grad_norm": 2.157775903735447, + "language_loss": 0.91652548, + "learning_rate": 2.9161497296578907e-06, + "loss": 0.94342828, + "num_input_tokens_seen": 1253870, + "step": 56, + "time_per_iteration": 2.8755834102630615 + }, + { + "auxiliary_loss_clip": 0.01450821, + "auxiliary_loss_mlp": 0.01250144, + "balance_loss_clip": 1.22551727, + "balance_loss_mlp": 1.08592176, + "epoch": 0.006853844766428185, + "flos": 15523178083200.0, + "grad_norm": 2.4023335967968653, + "language_loss": 0.85979295, + "learning_rate": 2.928972116604173e-06, + "loss": 0.88680255, + "num_input_tokens_seen": 1270145, + "step": 57, + "time_per_iteration": 2.81388783454895 + }, + { + "auxiliary_loss_clip": 0.01449323, + "auxiliary_loss_mlp": 0.01236311, + "balance_loss_clip": 1.224828, + "balance_loss_mlp": 1.07494962, + "epoch": 0.006974087657067276, + "flos": 24243760897920.0, + "grad_norm": 1.9643341954275118, + "language_loss": 1.02131057, + "learning_rate": 2.9415714941751377e-06, + "loss": 1.04816687, + "num_input_tokens_seen": 1291365, + "step": 58, + "time_per_iteration": 2.813157558441162 + }, + { + "auxiliary_loss_clip": 0.01446601, + "auxiliary_loss_mlp": 0.01254493, + "balance_loss_clip": 1.22184849, + "balance_loss_mlp": 1.09351301, + "epoch": 0.007094330547706367, + "flos": 25772513690880.0, + "grad_norm": 2.428867971770582, + "language_loss": 0.93549705, + "learning_rate": 2.9539554871897396e-06, + "loss": 0.96250802, + "num_input_tokens_seen": 1311535, + "step": 59, + "time_per_iteration": 2.8753137588500977 + }, + { + "auxiliary_loss_clip": 0.0144779, + "auxiliary_loss_mlp": 0.01231969, + "balance_loss_clip": 1.22315359, + "balance_loss_mlp": 1.07432652, + "epoch": 0.007214573438345458, + "flos": 21319015979520.0, + "grad_norm": 1.9894063422949002, + "language_loss": 0.97415, + "learning_rate": 2.9661313359851253e-06, + "loss": 1.00094759, + "num_input_tokens_seen": 1329420, + "step": 60, + "time_per_iteration": 2.783799648284912 + }, + { + "auxiliary_loss_clip": 0.01444779, + "auxiliary_loss_mlp": 0.01240485, + "balance_loss_clip": 1.2207979, + "balance_loss_mlp": 1.08293784, + "epoch": 0.007334816328984549, + "flos": 24937088192640.0, + "grad_norm": 1.9573273635940276, + "language_loss": 0.93822134, + "learning_rate": 2.978105921839922e-06, + "loss": 0.96507394, + "num_input_tokens_seen": 1349965, + "step": 61, + "time_per_iteration": 2.8611881732940674 + }, + { + "auxiliary_loss_clip": 0.01447549, + "auxiliary_loss_mlp": 0.01221721, + "balance_loss_clip": 1.22310114, + "balance_loss_mlp": 1.06827426, + "epoch": 0.00745505921962364, + "flos": 18510586277760.0, + "grad_norm": 2.5531563368472128, + "language_loss": 0.71974087, + "learning_rate": 2.9898857903302893e-06, + "loss": 0.74643356, + "num_input_tokens_seen": 1368915, + "step": 62, + "time_per_iteration": 2.837646722793579 + }, + { + "auxiliary_loss_clip": 0.01445028, + "auxiliary_loss_mlp": 0.01227371, + "balance_loss_clip": 1.22123599, + "balance_loss_mlp": 1.07344782, + "epoch": 0.007575302110262731, + "flos": 18477656484480.0, + "grad_norm": 2.8082808271419397, + "language_loss": 0.87726307, + "learning_rate": 3.001477172817253e-06, + "loss": 0.90398705, + "num_input_tokens_seen": 1386805, + "step": 63, + "time_per_iteration": 2.7603037357330322 + }, + { + "auxiliary_loss_clip": 0.01444223, + "auxiliary_loss_mlp": 0.01200805, + "balance_loss_clip": 1.22031832, + "balance_loss_mlp": 1.05098295, + "epoch": 0.007695545000901822, + "flos": 24973178382720.0, + "grad_norm": 2.834013851651584, + "language_loss": 0.96177852, + "learning_rate": 3.012886006241894e-06, + "loss": 0.9882288, + "num_input_tokens_seen": 1406190, + "step": 64, + "time_per_iteration": 2.88063645362854 + }, + { + "auxiliary_loss_clip": 0.01442676, + "auxiliary_loss_mlp": 0.01220977, + "balance_loss_clip": 1.21841788, + "balance_loss_mlp": 1.06657672, + "epoch": 0.007815787891540913, + "flos": 21324223451520.0, + "grad_norm": 1.9704564118896029, + "language_loss": 0.88225728, + "learning_rate": 3.0241179513858383e-06, + "loss": 0.90889382, + "num_input_tokens_seen": 1425500, + "step": 65, + "time_per_iteration": 4.714651584625244 + }, + { + "auxiliary_loss_clip": 0.01439245, + "auxiliary_loss_mlp": 0.01213133, + "balance_loss_clip": 1.21516347, + "balance_loss_mlp": 1.06207132, + "epoch": 0.007936030782180003, + "flos": 21575777374080.0, + "grad_norm": 2.2503880118868436, + "language_loss": 0.8764888, + "learning_rate": 3.035178409737647e-06, + "loss": 0.90301251, + "num_input_tokens_seen": 1442950, + "step": 66, + "time_per_iteration": 3.7866196632385254 + }, + { + "auxiliary_loss_clip": 0.01434825, + "auxiliary_loss_mlp": 0.012062, + "balance_loss_clip": 1.21057391, + "balance_loss_mlp": 1.05742681, + "epoch": 0.008056273672819095, + "flos": 20120785159680.0, + "grad_norm": 2.329690729409727, + "language_loss": 0.88650221, + "learning_rate": 3.046072539090907e-06, + "loss": 0.91291249, + "num_input_tokens_seen": 1460915, + "step": 67, + "time_per_iteration": 2.8819077014923096 + }, + { + "auxiliary_loss_clip": 0.01433893, + "auxiliary_loss_mlp": 0.0121273, + "balance_loss_clip": 1.209764, + "balance_loss_mlp": 1.06395733, + "epoch": 0.008176516563458186, + "flos": 18333116156160.0, + "grad_norm": 2.133277520541835, + "language_loss": 1.04506445, + "learning_rate": 3.056805267986779e-06, + "loss": 1.07153082, + "num_input_tokens_seen": 1478385, + "step": 68, + "time_per_iteration": 2.7532601356506348 + }, + { + "auxiliary_loss_clip": 0.01433178, + "auxiliary_loss_mlp": 0.01195379, + "balance_loss_clip": 1.20917082, + "balance_loss_mlp": 1.05127907, + "epoch": 0.008296759454097276, + "flos": 21872076664320.0, + "grad_norm": 2.4835516905239734, + "language_loss": 0.9518162, + "learning_rate": 3.0673813091022194e-06, + "loss": 0.97810179, + "num_input_tokens_seen": 1497605, + "step": 69, + "time_per_iteration": 2.8092763423919678 + }, + { + "auxiliary_loss_clip": 0.01523253, + "auxiliary_loss_mlp": 0.01194694, + "balance_loss_clip": 1.30044651, + "balance_loss_mlp": 1.08711946, + "epoch": 0.008417002344736368, + "flos": 63408228036480.0, + "grad_norm": 1.2953997525990115, + "language_loss": 0.62000132, + "learning_rate": 3.0778051716749317e-06, + "loss": 0.64718074, + "num_input_tokens_seen": 1561150, + "step": 70, + "time_per_iteration": 3.3984262943267822 + }, + { + "auxiliary_loss_clip": 0.01427113, + "auxiliary_loss_mlp": 0.01203259, + "balance_loss_clip": 1.20349193, + "balance_loss_mlp": 1.06173396, + "epoch": 0.008537245235375458, + "flos": 22966454286720.0, + "grad_norm": 1.9821516851651033, + "language_loss": 0.90385872, + "learning_rate": 3.0880811730470094e-06, + "loss": 0.93016249, + "num_input_tokens_seen": 1580605, + "step": 71, + "time_per_iteration": 2.80169415473938 + }, + { + "auxiliary_loss_clip": 0.01515891, + "auxiliary_loss_mlp": 0.01168041, + "balance_loss_clip": 1.29342413, + "balance_loss_mlp": 1.0635184, + "epoch": 0.008657488126014549, + "flos": 61984046712960.0, + "grad_norm": 1.1350437557247917, + "language_loss": 0.5859735, + "learning_rate": 3.098213449401257e-06, + "loss": 0.61281288, + "num_input_tokens_seen": 1647535, + "step": 72, + "time_per_iteration": 3.321467876434326 + }, + { + "auxiliary_loss_clip": 0.01423261, + "auxiliary_loss_mlp": 0.01192821, + "balance_loss_clip": 1.19941115, + "balance_loss_mlp": 1.0523448, + "epoch": 0.00877773101665364, + "flos": 30296791152000.0, + "grad_norm": 2.357067984961614, + "language_loss": 0.98828399, + "learning_rate": 3.1082059657570015e-06, + "loss": 1.01444483, + "num_input_tokens_seen": 1666770, + "step": 73, + "time_per_iteration": 2.9457526206970215 + }, + { + "auxiliary_loss_clip": 0.01421539, + "auxiliary_loss_mlp": 0.01187551, + "balance_loss_clip": 1.1978091, + "balance_loss_mlp": 1.05012655, + "epoch": 0.00889797390729273, + "flos": 23514056104320.0, + "grad_norm": 11.425879092226545, + "language_loss": 0.96483445, + "learning_rate": 3.1180625252858496e-06, + "loss": 0.99092543, + "num_input_tokens_seen": 1685200, + "step": 74, + "time_per_iteration": 2.8111448287963867 + }, + { + "auxiliary_loss_clip": 0.01420323, + "auxiliary_loss_mlp": 0.01190181, + "balance_loss_clip": 1.19647467, + "balance_loss_mlp": 1.05027723, + "epoch": 0.009018216797931822, + "flos": 23075838178560.0, + "grad_norm": 3.093228681472072, + "language_loss": 0.79953337, + "learning_rate": 3.1277867780021663e-06, + "loss": 0.82563841, + "num_input_tokens_seen": 1701835, + "step": 75, + "time_per_iteration": 2.8523547649383545 + }, + { + "auxiliary_loss_clip": 0.01419734, + "auxiliary_loss_mlp": 0.01188703, + "balance_loss_clip": 1.1959703, + "balance_loss_mlp": 1.04956174, + "epoch": 0.009138459688570914, + "flos": 15918877284480.0, + "grad_norm": 1.877430869802907, + "language_loss": 0.9554143, + "learning_rate": 3.1373822288779824e-06, + "loss": 0.9814986, + "num_input_tokens_seen": 1718415, + "step": 76, + "time_per_iteration": 2.6974122524261475 + }, + { + "auxiliary_loss_clip": 0.01417266, + "auxiliary_loss_mlp": 0.01206117, + "balance_loss_clip": 1.19392109, + "balance_loss_mlp": 1.06707096, + "epoch": 0.009258702579210003, + "flos": 27016531372800.0, + "grad_norm": 2.1768284927469796, + "language_loss": 0.79424381, + "learning_rate": 3.1468522454274533e-06, + "loss": 0.8204776, + "num_input_tokens_seen": 1738770, + "step": 77, + "time_per_iteration": 2.8358237743377686 + }, + { + "auxiliary_loss_clip": 0.01416498, + "auxiliary_loss_mlp": 0.01181805, + "balance_loss_clip": 1.19272888, + "balance_loss_mlp": 1.04514289, + "epoch": 0.009378945469849095, + "flos": 26903196984960.0, + "grad_norm": 2.0850199468849127, + "language_loss": 0.91675854, + "learning_rate": 3.15620006480197e-06, + "loss": 0.94274151, + "num_input_tokens_seen": 1758040, + "step": 78, + "time_per_iteration": 2.90281081199646 + }, + { + "auxiliary_loss_clip": 0.01416348, + "auxiliary_loss_mlp": 0.01187223, + "balance_loss_clip": 1.19319224, + "balance_loss_mlp": 1.05161047, + "epoch": 0.009499188360488187, + "flos": 35694236327040.0, + "grad_norm": 3.4227401044406762, + "language_loss": 0.74701762, + "learning_rate": 3.1654288004333087e-06, + "loss": 0.77305329, + "num_input_tokens_seen": 1776705, + "step": 79, + "time_per_iteration": 2.9553632736206055 + }, + { + "auxiliary_loss_clip": 0.01413388, + "auxiliary_loss_mlp": 0.01173324, + "balance_loss_clip": 1.19005704, + "balance_loss_mlp": 1.03780699, + "epoch": 0.009619431251127276, + "flos": 21503201944320.0, + "grad_norm": 2.4265480357394895, + "language_loss": 0.75872821, + "learning_rate": 3.1745414482589353e-06, + "loss": 0.78459537, + "num_input_tokens_seen": 1795915, + "step": 80, + "time_per_iteration": 2.8047642707824707 + }, + { + "auxiliary_loss_clip": 0.01413491, + "auxiliary_loss_mlp": 0.01171588, + "balance_loss_clip": 1.19037151, + "balance_loss_mlp": 1.03902709, + "epoch": 0.009739674141766368, + "flos": 17421056991360.0, + "grad_norm": 2.811916324543745, + "language_loss": 0.87045991, + "learning_rate": 3.1835408925606204e-06, + "loss": 0.89631069, + "num_input_tokens_seen": 1814055, + "step": 81, + "time_per_iteration": 2.8373162746429443 + }, + { + "auxiliary_loss_clip": 0.01411531, + "auxiliary_loss_mlp": 0.01174681, + "balance_loss_clip": 1.18791723, + "balance_loss_mlp": 1.04183412, + "epoch": 0.00985991703240546, + "flos": 27527109246720.0, + "grad_norm": 2.2482646888177054, + "language_loss": 0.89324033, + "learning_rate": 3.1924299114448214e-06, + "loss": 0.91910243, + "num_input_tokens_seen": 1834535, + "step": 82, + "time_per_iteration": 2.827615976333618 + }, + { + "auxiliary_loss_clip": 0.01410504, + "auxiliary_loss_mlp": 0.0118021, + "balance_loss_clip": 1.1877439, + "balance_loss_mlp": 1.04469275, + "epoch": 0.00998015992304455, + "flos": 13808084509440.0, + "grad_norm": 2.4520638115995452, + "language_loss": 0.83329999, + "learning_rate": 3.2012111819909055e-06, + "loss": 0.85920709, + "num_input_tokens_seen": 1851865, + "step": 83, + "time_per_iteration": 2.7964723110198975 + }, + { + "auxiliary_loss_clip": 0.01408759, + "auxiliary_loss_mlp": 0.01177215, + "balance_loss_clip": 1.18583202, + "balance_loss_mlp": 1.04598975, + "epoch": 0.010100402813683641, + "flos": 20191385341440.0, + "grad_norm": 2.5145398127399265, + "language_loss": 0.94901037, + "learning_rate": 3.2098872850910627e-06, + "loss": 0.97487009, + "num_input_tokens_seen": 1868540, + "step": 84, + "time_per_iteration": 2.825157403945923 + }, + { + "auxiliary_loss_clip": 0.01411256, + "auxiliary_loss_mlp": 0.01170609, + "balance_loss_clip": 1.18806815, + "balance_loss_mlp": 1.04100502, + "epoch": 0.010220645704322733, + "flos": 17201642762880.0, + "grad_norm": 1.891318833638457, + "language_loss": 0.89217162, + "learning_rate": 3.2184607100038194e-06, + "loss": 0.91799027, + "num_input_tokens_seen": 1887180, + "step": 85, + "time_per_iteration": 2.770509719848633 + }, + { + "auxiliary_loss_clip": 0.01406262, + "auxiliary_loss_mlp": 0.01166499, + "balance_loss_clip": 1.18369758, + "balance_loss_mlp": 1.04013765, + "epoch": 0.010340888594961822, + "flos": 21470415805440.0, + "grad_norm": 2.2387972865493126, + "language_loss": 0.93105954, + "learning_rate": 3.2269338586412414e-06, + "loss": 0.95678723, + "num_input_tokens_seen": 1904765, + "step": 86, + "time_per_iteration": 2.8370165824890137 + }, + { + "auxiliary_loss_clip": 0.01406505, + "auxiliary_loss_mlp": 0.01176875, + "balance_loss_clip": 1.1844697, + "balance_loss_mlp": 1.05013132, + "epoch": 0.010461131485600914, + "flos": 23002831785600.0, + "grad_norm": 2.306770087251855, + "language_loss": 0.96299827, + "learning_rate": 3.2353090496083106e-06, + "loss": 0.98883212, + "num_input_tokens_seen": 1922600, + "step": 87, + "time_per_iteration": 2.8137412071228027 + }, + { + "auxiliary_loss_clip": 0.01407054, + "auxiliary_loss_mlp": 0.0117441, + "balance_loss_clip": 1.18488824, + "balance_loss_mlp": 1.04509163, + "epoch": 0.010581374376240005, + "flos": 33546850571520.0, + "grad_norm": 1.923754891343902, + "language_loss": 0.81280327, + "learning_rate": 3.2435885220114572e-06, + "loss": 0.83861792, + "num_input_tokens_seen": 1943950, + "step": 88, + "time_per_iteration": 2.93853497505188 + }, + { + "auxiliary_loss_clip": 0.01408871, + "auxiliary_loss_mlp": 0.01171546, + "balance_loss_clip": 1.18636274, + "balance_loss_mlp": 1.04709136, + "epoch": 0.010701617266879095, + "flos": 21763087822080.0, + "grad_norm": 2.6662158560639235, + "language_loss": 0.93810576, + "learning_rate": 3.2517744390519113e-06, + "loss": 0.96390998, + "num_input_tokens_seen": 1962815, + "step": 89, + "time_per_iteration": 2.8690505027770996 + }, + { + "auxiliary_loss_clip": 0.01401984, + "auxiliary_loss_mlp": 0.01172526, + "balance_loss_clip": 1.1795063, + "balance_loss_mlp": 1.04702282, + "epoch": 0.010821860157518187, + "flos": 19060199256960.0, + "grad_norm": 2.448712184331282, + "language_loss": 0.75114417, + "learning_rate": 3.259868891418298e-06, + "loss": 0.77688932, + "num_input_tokens_seen": 1980580, + "step": 90, + "time_per_iteration": 3.7650296688079834 + }, + { + "auxiliary_loss_clip": 0.01405975, + "auxiliary_loss_mlp": 0.01178885, + "balance_loss_clip": 1.18358552, + "balance_loss_mlp": 1.05252361, + "epoch": 0.010942103048157278, + "flos": 25447378757760.0, + "grad_norm": 3.0069123689400716, + "language_loss": 0.8505733, + "learning_rate": 3.2678739004917757e-06, + "loss": 0.87642193, + "num_input_tokens_seen": 2000315, + "step": 91, + "time_per_iteration": 5.733070135116577 + }, + { + "auxiliary_loss_clip": 0.0140662, + "auxiliary_loss_mlp": 0.01174606, + "balance_loss_clip": 1.18450892, + "balance_loss_mlp": 1.05100965, + "epoch": 0.011062345938796368, + "flos": 27493928058240.0, + "grad_norm": 1.6739458078049858, + "language_loss": 0.91971397, + "learning_rate": 3.275791421376029e-06, + "loss": 0.9455263, + "num_input_tokens_seen": 2023760, + "step": 92, + "time_per_iteration": 2.9331815242767334 + }, + { + "auxiliary_loss_clip": 0.01400607, + "auxiliary_loss_mlp": 0.01166025, + "balance_loss_clip": 1.17824543, + "balance_loss_mlp": 1.0411886, + "epoch": 0.01118258882943546, + "flos": 16071210864000.0, + "grad_norm": 2.2443363483606156, + "language_loss": 0.95818508, + "learning_rate": 3.2836233457634622e-06, + "loss": 0.98385143, + "num_input_tokens_seen": 2041895, + "step": 93, + "time_per_iteration": 2.8107755184173584 + }, + { + "auxiliary_loss_clip": 0.01400388, + "auxiliary_loss_mlp": 0.01171376, + "balance_loss_clip": 1.17816246, + "balance_loss_mlp": 1.04587221, + "epoch": 0.011302831720074551, + "flos": 20668602458880.0, + "grad_norm": 2.3120144791677575, + "language_loss": 0.85259253, + "learning_rate": 3.2913715046481135e-06, + "loss": 0.87831008, + "num_input_tokens_seen": 2061640, + "step": 94, + "time_per_iteration": 2.8159215450286865 + }, + { + "auxiliary_loss_clip": 0.01396881, + "auxiliary_loss_mlp": 0.01156571, + "balance_loss_clip": 1.17459929, + "balance_loss_mlp": 1.0340234, + "epoch": 0.011423074610713641, + "flos": 13072238490240.0, + "grad_norm": 76.82026107708639, + "language_loss": 0.88771152, + "learning_rate": 3.299037670895023e-06, + "loss": 0.91324604, + "num_input_tokens_seen": 2078255, + "step": 95, + "time_per_iteration": 2.7820472717285156 + }, + { + "auxiliary_loss_clip": 0.01400023, + "auxiliary_loss_mlp": 0.01176355, + "balance_loss_clip": 1.17767954, + "balance_loss_mlp": 1.0526638, + "epoch": 0.011543317501352733, + "flos": 30335646689280.0, + "grad_norm": 1.8883941065130354, + "language_loss": 0.80180144, + "learning_rate": 3.3066235616750667e-06, + "loss": 0.82756519, + "num_input_tokens_seen": 2099490, + "step": 96, + "time_per_iteration": 3.0105488300323486 + }, + { + "auxiliary_loss_clip": 0.01396624, + "auxiliary_loss_mlp": 0.01163118, + "balance_loss_clip": 1.17450643, + "balance_loss_mlp": 1.04238296, + "epoch": 0.011663560391991824, + "flos": 15522962601600.0, + "grad_norm": 2.3452271460252927, + "language_loss": 0.92324352, + "learning_rate": 3.3141308407736276e-06, + "loss": 0.94884104, + "num_input_tokens_seen": 2116125, + "step": 97, + "time_per_iteration": 2.8599579334259033 + }, + { + "auxiliary_loss_clip": 0.01392791, + "auxiliary_loss_mlp": 0.01167555, + "balance_loss_clip": 1.17069578, + "balance_loss_mlp": 1.04681969, + "epoch": 0.011783803282630914, + "flos": 19902125116800.0, + "grad_norm": 1.9316580445789986, + "language_loss": 0.86726576, + "learning_rate": 3.321561120780869e-06, + "loss": 0.89286917, + "num_input_tokens_seen": 2134835, + "step": 98, + "time_per_iteration": 2.776670217514038 + }, + { + "auxiliary_loss_clip": 0.0139851, + "auxiliary_loss_mlp": 0.01163525, + "balance_loss_clip": 1.17655826, + "balance_loss_mlp": 1.04507852, + "epoch": 0.011904046173270006, + "flos": 22340674517760.0, + "grad_norm": 2.199292326358966, + "language_loss": 1.01415277, + "learning_rate": 3.3289159651708192e-06, + "loss": 1.03977323, + "num_input_tokens_seen": 2152410, + "step": 99, + "time_per_iteration": 2.9576382637023926 + }, + { + "auxiliary_loss_clip": 0.01391736, + "auxiliary_loss_mlp": 0.01168883, + "balance_loss_clip": 1.16963911, + "balance_loss_mlp": 1.04595447, + "epoch": 0.012024289063909096, + "flos": 19100060375040.0, + "grad_norm": 1.9368873058063174, + "language_loss": 0.97499323, + "learning_rate": 3.3361968902759768e-06, + "loss": 1.00059938, + "num_input_tokens_seen": 2172090, + "step": 100, + "time_per_iteration": 2.908867597579956 + }, + { + "auxiliary_loss_clip": 0.01391999, + "auxiliary_loss_mlp": 0.01167268, + "balance_loss_clip": 1.17027497, + "balance_loss_mlp": 1.04634202, + "epoch": 0.012144531954548187, + "flos": 15012205159680.0, + "grad_norm": 2.2310911808000102, + "language_loss": 0.93769991, + "learning_rate": 3.343405367163663e-06, + "loss": 0.96329248, + "num_input_tokens_seen": 2189020, + "step": 101, + "time_per_iteration": 2.785670518875122 + }, + { + "auxiliary_loss_clip": 0.01390844, + "auxiliary_loss_mlp": 0.01155312, + "balance_loss_clip": 1.16908622, + "balance_loss_mlp": 1.03505337, + "epoch": 0.012264774845187279, + "flos": 15122020014720.0, + "grad_norm": 2.588316934635679, + "language_loss": 0.8125056, + "learning_rate": 3.350542823419951e-06, + "loss": 0.83796716, + "num_input_tokens_seen": 2205620, + "step": 102, + "time_per_iteration": 2.8728392124176025 + }, + { + "auxiliary_loss_clip": 0.01388737, + "auxiliary_loss_mlp": 0.01166834, + "balance_loss_clip": 1.16717052, + "balance_loss_mlp": 1.0478158, + "epoch": 0.012385017735826368, + "flos": 13949248959360.0, + "grad_norm": 3.2288643962070007, + "language_loss": 0.87444055, + "learning_rate": 3.3576106448465615e-06, + "loss": 0.89999622, + "num_input_tokens_seen": 2219000, + "step": 103, + "time_per_iteration": 2.737010955810547 + }, + { + "auxiliary_loss_clip": 0.01383978, + "auxiliary_loss_mlp": 0.01160484, + "balance_loss_clip": 1.16293573, + "balance_loss_mlp": 1.04184651, + "epoch": 0.01250526062646546, + "flos": 23623260428160.0, + "grad_norm": 2.9367768714339033, + "language_loss": 0.88141727, + "learning_rate": 3.3646101770757797e-06, + "loss": 0.90686184, + "num_input_tokens_seen": 2237790, + "step": 104, + "time_per_iteration": 2.8882956504821777 + }, + { + "auxiliary_loss_clip": 0.01388899, + "auxiliary_loss_mlp": 0.01158002, + "balance_loss_clip": 1.16775489, + "balance_loss_mlp": 1.04117668, + "epoch": 0.012625503517104552, + "flos": 34640078958720.0, + "grad_norm": 2.150551720871757, + "language_loss": 0.85594261, + "learning_rate": 3.371542727108104e-06, + "loss": 0.88141155, + "num_input_tokens_seen": 2259965, + "step": 105, + "time_per_iteration": 2.9317128658294678 + }, + { + "auxiliary_loss_clip": 0.01386201, + "auxiliary_loss_mlp": 0.01158795, + "balance_loss_clip": 1.16474771, + "balance_loss_mlp": 1.04292381, + "epoch": 0.012745746407743641, + "flos": 17821891837440.0, + "grad_norm": 2.500142723963747, + "language_loss": 0.89986807, + "learning_rate": 3.3784095647770114e-06, + "loss": 0.925318, + "num_input_tokens_seen": 2278610, + "step": 106, + "time_per_iteration": 2.876786231994629 + }, + { + "auxiliary_loss_clip": 0.01383704, + "auxiliary_loss_mlp": 0.0116439, + "balance_loss_clip": 1.16225398, + "balance_loss_mlp": 1.04699254, + "epoch": 0.012865989298382733, + "flos": 20595057361920.0, + "grad_norm": 2.3518014310576563, + "language_loss": 0.8847115, + "learning_rate": 3.3852119241449547e-06, + "loss": 0.91019237, + "num_input_tokens_seen": 2297730, + "step": 107, + "time_per_iteration": 2.890155792236328 + }, + { + "auxiliary_loss_clip": 0.0138394, + "auxiliary_loss_mlp": 0.01151609, + "balance_loss_clip": 1.16247189, + "balance_loss_mlp": 1.03630972, + "epoch": 0.012986232189021825, + "flos": 23948969978880.0, + "grad_norm": 3.873014593633782, + "language_loss": 0.96117347, + "learning_rate": 3.3919510048344295e-06, + "loss": 0.98652899, + "num_input_tokens_seen": 2315740, + "step": 108, + "time_per_iteration": 2.805663585662842 + }, + { + "auxiliary_loss_clip": 0.01381728, + "auxiliary_loss_mlp": 0.01160388, + "balance_loss_clip": 1.16068923, + "balance_loss_mlp": 1.04499316, + "epoch": 0.013106475079660914, + "flos": 23725425686400.0, + "grad_norm": 2.0250318992755636, + "language_loss": 0.86424148, + "learning_rate": 3.3986279732976907e-06, + "loss": 0.88966262, + "num_input_tokens_seen": 2334215, + "step": 109, + "time_per_iteration": 2.894200325012207 + }, + { + "auxiliary_loss_clip": 0.01383123, + "auxiliary_loss_mlp": 0.01155702, + "balance_loss_clip": 1.16219163, + "balance_loss_mlp": 1.04145241, + "epoch": 0.013226717970300006, + "flos": 21102438925440.0, + "grad_norm": 2.1681844817835207, + "language_loss": 0.95390075, + "learning_rate": 3.4052439640284983e-06, + "loss": 0.979289, + "num_input_tokens_seen": 2353130, + "step": 110, + "time_per_iteration": 2.9180502891540527 + }, + { + "auxiliary_loss_clip": 0.01380359, + "auxiliary_loss_mlp": 0.01153779, + "balance_loss_clip": 1.15971041, + "balance_loss_mlp": 1.03991032, + "epoch": 0.013346960860939098, + "flos": 24863902231680.0, + "grad_norm": 1.8938215262592129, + "language_loss": 0.81052971, + "learning_rate": 3.4118000807190217e-06, + "loss": 0.8358711, + "num_input_tokens_seen": 2374010, + "step": 111, + "time_per_iteration": 3.026261568069458 + }, + { + "auxiliary_loss_clip": 0.01378261, + "auxiliary_loss_mlp": 0.01157167, + "balance_loss_clip": 1.15748382, + "balance_loss_mlp": 1.04129517, + "epoch": 0.013467203751578187, + "flos": 28181940140160.0, + "grad_norm": 1.712020702474799, + "language_loss": 0.76024407, + "learning_rate": 3.4182973973648723e-06, + "loss": 0.78559834, + "num_input_tokens_seen": 2395220, + "step": 112, + "time_per_iteration": 2.793267250061035 + }, + { + "auxiliary_loss_clip": 0.01379825, + "auxiliary_loss_mlp": 0.01163494, + "balance_loss_clip": 1.15953183, + "balance_loss_mlp": 1.0504837, + "epoch": 0.013587446642217279, + "flos": 18916233546240.0, + "grad_norm": 3.153929654597867, + "language_loss": 0.94874847, + "learning_rate": 3.424736959321014e-06, + "loss": 0.97418165, + "num_input_tokens_seen": 2413025, + "step": 113, + "time_per_iteration": 2.784010171890259 + }, + { + "auxiliary_loss_clip": 0.01378959, + "auxiliary_loss_mlp": 0.01154914, + "balance_loss_clip": 1.15876055, + "balance_loss_mlp": 1.04142737, + "epoch": 0.01370768953285637, + "flos": 23988615615360.0, + "grad_norm": 2.4244590127590393, + "language_loss": 0.88614738, + "learning_rate": 3.431119784311155e-06, + "loss": 0.91148603, + "num_input_tokens_seen": 2432700, + "step": 114, + "time_per_iteration": 2.9280476570129395 + }, + { + "auxiliary_loss_clip": 0.01378742, + "auxiliary_loss_mlp": 0.01163239, + "balance_loss_clip": 1.15849972, + "balance_loss_mlp": 1.05061007, + "epoch": 0.01382793242349546, + "flos": 39202565512320.0, + "grad_norm": 1.8331530433321483, + "language_loss": 0.77393633, + "learning_rate": 3.43744686339307e-06, + "loss": 0.7993561, + "num_input_tokens_seen": 2455020, + "step": 115, + "time_per_iteration": 2.886049270629883 + }, + { + "auxiliary_loss_clip": 0.01370156, + "auxiliary_loss_mlp": 0.01148213, + "balance_loss_clip": 1.14932179, + "balance_loss_mlp": 1.03482151, + "epoch": 0.013948175314134552, + "flos": 41353506714240.0, + "grad_norm": 2.250706063465028, + "language_loss": 0.90700191, + "learning_rate": 3.44371916188212e-06, + "loss": 0.93218559, + "num_input_tokens_seen": 2475775, + "step": 116, + "time_per_iteration": 4.5577569007873535 + }, + { + "auxiliary_loss_clip": 0.01371261, + "auxiliary_loss_mlp": 0.01144741, + "balance_loss_clip": 1.15090895, + "balance_loss_mlp": 1.0320164, + "epoch": 0.014068418204773643, + "flos": 22453542028800.0, + "grad_norm": 2.0596505749420753, + "language_loss": 0.86151552, + "learning_rate": 3.449937620235143e-06, + "loss": 0.88667548, + "num_input_tokens_seen": 2496370, + "step": 117, + "time_per_iteration": 4.58258581161499 + }, + { + "auxiliary_loss_clip": 0.01371794, + "auxiliary_loss_mlp": 0.0115879, + "balance_loss_clip": 1.1518929, + "balance_loss_mlp": 1.04568446, + "epoch": 0.014188661095412733, + "flos": 23805147922560.0, + "grad_norm": 1.7744579235685403, + "language_loss": 0.8917042, + "learning_rate": 3.456103154896722e-06, + "loss": 0.91701007, + "num_input_tokens_seen": 2517645, + "step": 118, + "time_per_iteration": 2.872995376586914 + }, + { + "auxiliary_loss_clip": 0.01372098, + "auxiliary_loss_mlp": 0.01150684, + "balance_loss_clip": 1.15258861, + "balance_loss_mlp": 1.03919935, + "epoch": 0.014308903986051825, + "flos": 23660248458240.0, + "grad_norm": 1.8726319578647637, + "language_loss": 0.9248898, + "learning_rate": 3.462216659109757e-06, + "loss": 0.95011759, + "num_input_tokens_seen": 2537825, + "step": 119, + "time_per_iteration": 2.7920894622802734 + }, + { + "auxiliary_loss_clip": 0.01373715, + "auxiliary_loss_mlp": 0.01146637, + "balance_loss_clip": 1.15318298, + "balance_loss_mlp": 1.03715491, + "epoch": 0.014429146876690916, + "flos": 20667991927680.0, + "grad_norm": 2.2291811121750107, + "language_loss": 0.85403317, + "learning_rate": 3.4682790036921077e-06, + "loss": 0.8792367, + "num_input_tokens_seen": 2556485, + "step": 120, + "time_per_iteration": 2.9859273433685303 + }, + { + "auxiliary_loss_clip": 0.01370396, + "auxiliary_loss_mlp": 0.0115738, + "balance_loss_clip": 1.15055656, + "balance_loss_mlp": 1.04732645, + "epoch": 0.014549389767330006, + "flos": 20229199384320.0, + "grad_norm": 2.0034101621460656, + "language_loss": 0.83075976, + "learning_rate": 3.4742910377810193e-06, + "loss": 0.8560375, + "num_input_tokens_seen": 2573945, + "step": 121, + "time_per_iteration": 2.7963881492614746 + }, + { + "auxiliary_loss_clip": 0.01369686, + "auxiliary_loss_mlp": 0.01158609, + "balance_loss_clip": 1.14975667, + "balance_loss_mlp": 1.04989052, + "epoch": 0.014669632657969098, + "flos": 18004174381440.0, + "grad_norm": 2.0437635839933046, + "language_loss": 0.88630557, + "learning_rate": 3.4802535895469042e-06, + "loss": 0.91158855, + "num_input_tokens_seen": 2592695, + "step": 122, + "time_per_iteration": 2.8678855895996094 + }, + { + "auxiliary_loss_clip": 0.013669, + "auxiliary_loss_mlp": 0.01143708, + "balance_loss_clip": 1.14661145, + "balance_loss_mlp": 1.03336835, + "epoch": 0.01478987554860819, + "flos": 22741796672640.0, + "grad_norm": 2.1688782895778616, + "language_loss": 0.89473212, + "learning_rate": 3.4861674668779934e-06, + "loss": 0.91983819, + "num_input_tokens_seen": 2610925, + "step": 123, + "time_per_iteration": 2.7793285846710205 + }, + { + "auxiliary_loss_clip": 0.01367455, + "auxiliary_loss_mlp": 0.01140495, + "balance_loss_clip": 1.14719486, + "balance_loss_mlp": 1.03025055, + "epoch": 0.01491011843924728, + "flos": 17198590106880.0, + "grad_norm": 1.9781284872904201, + "language_loss": 0.84233141, + "learning_rate": 3.492033458037272e-06, + "loss": 0.8674109, + "num_input_tokens_seen": 2629495, + "step": 124, + "time_per_iteration": 2.7881312370300293 + }, + { + "auxiliary_loss_clip": 0.01361898, + "auxiliary_loss_mlp": 0.01148976, + "balance_loss_clip": 1.14203739, + "balance_loss_mlp": 1.0388267, + "epoch": 0.01503036132988637, + "flos": 17673867889920.0, + "grad_norm": 3.3474901352198176, + "language_loss": 0.86812663, + "learning_rate": 3.497852332293018e-06, + "loss": 0.89323533, + "num_input_tokens_seen": 2645070, + "step": 125, + "time_per_iteration": 2.7670373916625977 + }, + { + "auxiliary_loss_clip": 0.01366282, + "auxiliary_loss_mlp": 0.0114184, + "balance_loss_clip": 1.14598823, + "balance_loss_mlp": 1.03455138, + "epoch": 0.015150604220525462, + "flos": 18878239935360.0, + "grad_norm": 6.636974019910855, + "language_loss": 0.96509111, + "learning_rate": 3.5036248405242356e-06, + "loss": 0.99017233, + "num_input_tokens_seen": 2663825, + "step": 126, + "time_per_iteration": 2.6907382011413574 + }, + { + "auxiliary_loss_clip": 0.01363704, + "auxiliary_loss_mlp": 0.01143872, + "balance_loss_clip": 1.14402819, + "balance_loss_mlp": 1.03496242, + "epoch": 0.015270847111164552, + "flos": 39420184060800.0, + "grad_norm": 2.5966998213872334, + "language_loss": 0.82572722, + "learning_rate": 3.509351715802146e-06, + "loss": 0.85080296, + "num_input_tokens_seen": 2684710, + "step": 127, + "time_per_iteration": 2.9555139541625977 + }, + { + "auxiliary_loss_clip": 0.01359433, + "auxiliary_loss_mlp": 0.01145721, + "balance_loss_clip": 1.14051759, + "balance_loss_mlp": 1.03738403, + "epoch": 0.015391090001803644, + "flos": 43762466286720.0, + "grad_norm": 2.372542826070976, + "language_loss": 0.78340715, + "learning_rate": 3.5150336739488763e-06, + "loss": 0.80845869, + "num_input_tokens_seen": 2706995, + "step": 128, + "time_per_iteration": 2.9804131984710693 + }, + { + "auxiliary_loss_clip": 0.01364024, + "auxiliary_loss_mlp": 0.0114468, + "balance_loss_clip": 1.1440419, + "balance_loss_mlp": 1.03624678, + "epoch": 0.015511332892442733, + "flos": 18916341287040.0, + "grad_norm": 1.9823584902926124, + "language_loss": 0.84050113, + "learning_rate": 3.5206714140744143e-06, + "loss": 0.86558813, + "num_input_tokens_seen": 2727050, + "step": 129, + "time_per_iteration": 2.9966650009155273 + }, + { + "auxiliary_loss_clip": 0.01362804, + "auxiliary_loss_mlp": 0.01151668, + "balance_loss_clip": 1.14328408, + "balance_loss_mlp": 1.04399824, + "epoch": 0.015631575783081827, + "flos": 24535283679360.0, + "grad_norm": 2.5414119330907283, + "language_loss": 0.87636864, + "learning_rate": 3.5262656190928208e-06, + "loss": 0.90151346, + "num_input_tokens_seen": 2745350, + "step": 130, + "time_per_iteration": 2.983201742172241 + }, + { + "auxiliary_loss_clip": 0.01418939, + "auxiliary_loss_mlp": 0.01120852, + "balance_loss_clip": 1.201671, + "balance_loss_mlp": 1.03273296, + "epoch": 0.015751818673720917, + "flos": 62328536098560.0, + "grad_norm": 1.0489922453485585, + "language_loss": 0.71513927, + "learning_rate": 3.5318169562186737e-06, + "loss": 0.74053717, + "num_input_tokens_seen": 2814195, + "step": 131, + "time_per_iteration": 3.5059289932250977 + }, + { + "auxiliary_loss_clip": 0.01358075, + "auxiliary_loss_mlp": 0.0115591, + "balance_loss_clip": 1.13896132, + "balance_loss_mlp": 1.04757261, + "epoch": 0.015872061564360006, + "flos": 23878549365120.0, + "grad_norm": 1.8269937368252842, + "language_loss": 0.82000029, + "learning_rate": 3.5373260774446292e-06, + "loss": 0.8451401, + "num_input_tokens_seen": 2834645, + "step": 132, + "time_per_iteration": 2.867751121520996 + }, + { + "auxiliary_loss_clip": 0.0135963, + "auxiliary_loss_mlp": 0.01141187, + "balance_loss_clip": 1.13975883, + "balance_loss_mlp": 1.03332686, + "epoch": 0.0159923044549991, + "flos": 23367899664000.0, + "grad_norm": 1.850198007559345, + "language_loss": 0.90389431, + "learning_rate": 3.542793620000961e-06, + "loss": 0.92890251, + "num_input_tokens_seen": 2854120, + "step": 133, + "time_per_iteration": 2.7973148822784424 + }, + { + "auxiliary_loss_clip": 0.01358099, + "auxiliary_loss_mlp": 0.01153143, + "balance_loss_clip": 1.13942111, + "balance_loss_mlp": 1.04509163, + "epoch": 0.01611254734563819, + "flos": 17858305249920.0, + "grad_norm": 2.096978967924228, + "language_loss": 0.86865306, + "learning_rate": 3.5482202067978894e-06, + "loss": 0.89376551, + "num_input_tokens_seen": 2871330, + "step": 134, + "time_per_iteration": 2.825521945953369 + }, + { + "auxiliary_loss_clip": 0.01359446, + "auxiliary_loss_mlp": 0.01144198, + "balance_loss_clip": 1.14047182, + "balance_loss_mlp": 1.03834033, + "epoch": 0.01623279023627728, + "flos": 20954774113920.0, + "grad_norm": 3.3078137879564977, + "language_loss": 0.76042205, + "learning_rate": 3.553606446851471e-06, + "loss": 0.78545845, + "num_input_tokens_seen": 2888070, + "step": 135, + "time_per_iteration": 2.8413732051849365 + }, + { + "auxiliary_loss_clip": 0.01355842, + "auxiliary_loss_mlp": 0.01142793, + "balance_loss_clip": 1.13697159, + "balance_loss_mlp": 1.03731644, + "epoch": 0.016353033126916373, + "flos": 15742412743680.0, + "grad_norm": 1.879316342481586, + "language_loss": 0.83420658, + "learning_rate": 3.5589529356937613e-06, + "loss": 0.85919285, + "num_input_tokens_seen": 2906465, + "step": 136, + "time_per_iteration": 2.825993299484253 + }, + { + "auxiliary_loss_clip": 0.01353052, + "auxiliary_loss_mlp": 0.01143978, + "balance_loss_clip": 1.13440299, + "balance_loss_mlp": 1.03659439, + "epoch": 0.016473276017555463, + "flos": 18807280617600.0, + "grad_norm": 1.7249368901533138, + "language_loss": 0.77025783, + "learning_rate": 3.5642602557679627e-06, + "loss": 0.79522812, + "num_input_tokens_seen": 2924915, + "step": 137, + "time_per_iteration": 2.8423595428466797 + }, + { + "auxiliary_loss_clip": 0.01358473, + "auxiliary_loss_mlp": 0.01147278, + "balance_loss_clip": 1.13975859, + "balance_loss_mlp": 1.04180193, + "epoch": 0.016593518908194552, + "flos": 24352641999360.0, + "grad_norm": 3.3705017144998854, + "language_loss": 0.84206325, + "learning_rate": 3.569528976809202e-06, + "loss": 0.86712074, + "num_input_tokens_seen": 2942130, + "step": 138, + "time_per_iteration": 2.849653482437134 + }, + { + "auxiliary_loss_clip": 0.01351516, + "auxiliary_loss_mlp": 0.01143243, + "balance_loss_clip": 1.13299704, + "balance_loss_mlp": 1.03872061, + "epoch": 0.016713761798833646, + "flos": 22346133384960.0, + "grad_norm": 2.108243990735047, + "language_loss": 0.89761543, + "learning_rate": 3.5747596562115522e-06, + "loss": 0.92256296, + "num_input_tokens_seen": 2962745, + "step": 139, + "time_per_iteration": 2.8003170490264893 + }, + { + "auxiliary_loss_clip": 0.01351663, + "auxiliary_loss_mlp": 0.01137528, + "balance_loss_clip": 1.13328898, + "balance_loss_mlp": 1.03186083, + "epoch": 0.016834004689472735, + "flos": 17821820010240.0, + "grad_norm": 3.702279019707138, + "language_loss": 0.90862608, + "learning_rate": 3.5799528393819138e-06, + "loss": 0.93351805, + "num_input_tokens_seen": 2981825, + "step": 140, + "time_per_iteration": 2.7428550720214844 + }, + { + "auxiliary_loss_clip": 0.01350113, + "auxiliary_loss_mlp": 0.01141525, + "balance_loss_clip": 1.13238144, + "balance_loss_mlp": 1.03633475, + "epoch": 0.016954247580111825, + "flos": 20519501103360.0, + "grad_norm": 9.476208079084499, + "language_loss": 0.88011456, + "learning_rate": 3.585109060081286e-06, + "loss": 0.90503103, + "num_input_tokens_seen": 3001625, + "step": 141, + "time_per_iteration": 2.803173303604126 + }, + { + "auxiliary_loss_clip": 0.01349232, + "auxiliary_loss_mlp": 0.0114464, + "balance_loss_clip": 1.13136029, + "balance_loss_mlp": 1.03887773, + "epoch": 0.017074490470750915, + "flos": 22088869200000.0, + "grad_norm": 1.9471377253637756, + "language_loss": 0.78428113, + "learning_rate": 3.590228840753992e-06, + "loss": 0.80921984, + "num_input_tokens_seen": 3022055, + "step": 142, + "time_per_iteration": 4.759372234344482 + }, + { + "auxiliary_loss_clip": 0.01349886, + "auxiliary_loss_mlp": 0.01135817, + "balance_loss_clip": 1.13185072, + "balance_loss_mlp": 1.03158021, + "epoch": 0.01719473336139001, + "flos": 15997270717440.0, + "grad_norm": 2.406452148965106, + "language_loss": 0.8730737, + "learning_rate": 3.5953126928453423e-06, + "loss": 0.89793068, + "num_input_tokens_seen": 3039605, + "step": 143, + "time_per_iteration": 2.7330703735351562 + }, + { + "auxiliary_loss_clip": 0.0134951, + "auxiliary_loss_mlp": 0.01132646, + "balance_loss_clip": 1.13208115, + "balance_loss_mlp": 1.0296489, + "epoch": 0.017314976252029098, + "flos": 22492038430080.0, + "grad_norm": 2.3275859334947655, + "language_loss": 0.80464607, + "learning_rate": 3.600361117108239e-06, + "loss": 0.82946765, + "num_input_tokens_seen": 3059405, + "step": 144, + "time_per_iteration": 2.830112934112549 + }, + { + "auxiliary_loss_clip": 0.01349097, + "auxiliary_loss_mlp": 0.01137945, + "balance_loss_clip": 1.13114047, + "balance_loss_mlp": 1.03456688, + "epoch": 0.017435219142668188, + "flos": 22018053536640.0, + "grad_norm": 1.9458841360426224, + "language_loss": 0.97240883, + "learning_rate": 3.6053746038991616e-06, + "loss": 0.99727929, + "num_input_tokens_seen": 3078490, + "step": 145, + "time_per_iteration": 2.7302958965301514 + }, + { + "auxiliary_loss_clip": 0.01393907, + "auxiliary_loss_mlp": 0.01088298, + "balance_loss_clip": 1.17873657, + "balance_loss_mlp": 1.00208569, + "epoch": 0.01755546203330728, + "flos": 72240526149120.0, + "grad_norm": 1.0648598288022697, + "language_loss": 0.58417141, + "learning_rate": 3.6103536334639843e-06, + "loss": 0.60899341, + "num_input_tokens_seen": 3131755, + "step": 146, + "time_per_iteration": 3.3215689659118652 + }, + { + "auxiliary_loss_clip": 0.01346135, + "auxiliary_loss_mlp": 0.01133646, + "balance_loss_clip": 1.12815034, + "balance_loss_mlp": 1.0300765, + "epoch": 0.01767570492394637, + "flos": 25337061112320.0, + "grad_norm": 4.403396807708519, + "language_loss": 0.85522997, + "learning_rate": 3.615298676214041e-06, + "loss": 0.88002777, + "num_input_tokens_seen": 3152035, + "step": 147, + "time_per_iteration": 2.839264154434204 + }, + { + "auxiliary_loss_clip": 0.01342651, + "auxiliary_loss_mlp": 0.01137297, + "balance_loss_clip": 1.12551475, + "balance_loss_mlp": 1.03353727, + "epoch": 0.01779594781458546, + "flos": 20449188230400.0, + "grad_norm": 2.1043550210518362, + "language_loss": 0.88773173, + "learning_rate": 3.6202101929928317e-06, + "loss": 0.91253126, + "num_input_tokens_seen": 3170625, + "step": 148, + "time_per_iteration": 2.8033108711242676 + }, + { + "auxiliary_loss_clip": 0.01344954, + "auxiliary_loss_mlp": 0.01135221, + "balance_loss_clip": 1.12744594, + "balance_loss_mlp": 1.03270078, + "epoch": 0.017916190705224554, + "flos": 16253601148800.0, + "grad_norm": 2.351767560624089, + "language_loss": 0.88590062, + "learning_rate": 3.6250886353337413e-06, + "loss": 0.91070235, + "num_input_tokens_seen": 3188155, + "step": 149, + "time_per_iteration": 2.808407783508301 + }, + { + "auxiliary_loss_clip": 0.01344893, + "auxiliary_loss_mlp": 0.01138293, + "balance_loss_clip": 1.12759805, + "balance_loss_mlp": 1.0352006, + "epoch": 0.018036433595863644, + "flos": 23330588411520.0, + "grad_norm": 2.5788046155892994, + "language_loss": 0.86303937, + "learning_rate": 3.6299344457091488e-06, + "loss": 0.88787127, + "num_input_tokens_seen": 3209015, + "step": 150, + "time_per_iteration": 2.842949628829956 + }, + { + "auxiliary_loss_clip": 0.01343654, + "auxiliary_loss_mlp": 0.01129863, + "balance_loss_clip": 1.12664247, + "balance_loss_mlp": 1.02781963, + "epoch": 0.018156676486502734, + "flos": 18588010043520.0, + "grad_norm": 3.997541015097387, + "language_loss": 0.93687409, + "learning_rate": 3.634748057771256e-06, + "loss": 0.96160924, + "num_input_tokens_seen": 3224955, + "step": 151, + "time_per_iteration": 2.8504397869110107 + }, + { + "auxiliary_loss_clip": 0.01342011, + "auxiliary_loss_mlp": 0.01136653, + "balance_loss_clip": 1.12520742, + "balance_loss_mlp": 1.03518176, + "epoch": 0.018276919377141827, + "flos": 25448707560960.0, + "grad_norm": 1.8509805864124225, + "language_loss": 0.85640275, + "learning_rate": 3.639529896584965e-06, + "loss": 0.88118935, + "num_input_tokens_seen": 3246330, + "step": 152, + "time_per_iteration": 2.8738725185394287 + }, + { + "auxiliary_loss_clip": 0.0134437, + "auxiliary_loss_mlp": 0.01135096, + "balance_loss_clip": 1.12756324, + "balance_loss_mlp": 1.03238511, + "epoch": 0.018397162267780917, + "flos": 20047311889920.0, + "grad_norm": 3.0556267038809364, + "language_loss": 0.88773161, + "learning_rate": 3.6442803788531233e-06, + "loss": 0.91252631, + "num_input_tokens_seen": 3264290, + "step": 153, + "time_per_iteration": 2.887213945388794 + }, + { + "auxiliary_loss_clip": 0.01341433, + "auxiliary_loss_mlp": 0.01137363, + "balance_loss_clip": 1.12468839, + "balance_loss_mlp": 1.03446114, + "epoch": 0.018517405158420007, + "flos": 27565282425600.0, + "grad_norm": 3.499632471091065, + "language_loss": 0.95896065, + "learning_rate": 3.6489999131344357e-06, + "loss": 0.98374856, + "num_input_tokens_seen": 3287065, + "step": 154, + "time_per_iteration": 2.8408541679382324 + }, + { + "auxiliary_loss_clip": 0.01342822, + "auxiliary_loss_mlp": 0.01132716, + "balance_loss_clip": 1.12662697, + "balance_loss_mlp": 1.03215075, + "epoch": 0.0186376480490591, + "flos": 19354056422400.0, + "grad_norm": 1.7899159892983167, + "language_loss": 0.90547115, + "learning_rate": 3.653688900054313e-06, + "loss": 0.93022656, + "num_input_tokens_seen": 3305595, + "step": 155, + "time_per_iteration": 2.760671854019165 + }, + { + "auxiliary_loss_clip": 0.01339504, + "auxiliary_loss_mlp": 0.01131732, + "balance_loss_clip": 1.12310672, + "balance_loss_mlp": 1.03054714, + "epoch": 0.01875789093969819, + "flos": 26687840993280.0, + "grad_norm": 2.581247147368395, + "language_loss": 0.76163793, + "learning_rate": 3.6583477325089526e-06, + "loss": 0.78635025, + "num_input_tokens_seen": 3326135, + "step": 156, + "time_per_iteration": 2.8282296657562256 + }, + { + "auxiliary_loss_clip": 0.01341554, + "auxiliary_loss_mlp": 0.01135868, + "balance_loss_clip": 1.12552512, + "balance_loss_mlp": 1.03372979, + "epoch": 0.01887813383033728, + "flos": 24353001135360.0, + "grad_norm": 2.379908616175745, + "language_loss": 1.04315329, + "learning_rate": 3.6629767958628916e-06, + "loss": 1.06792748, + "num_input_tokens_seen": 3343510, + "step": 157, + "time_per_iteration": 2.7303106784820557 + }, + { + "auxiliary_loss_clip": 0.01341772, + "auxiliary_loss_mlp": 0.01130041, + "balance_loss_clip": 1.12528682, + "balance_loss_mlp": 1.02914214, + "epoch": 0.018998376720976373, + "flos": 14647532330880.0, + "grad_norm": 2.1664713415036667, + "language_loss": 0.85580623, + "learning_rate": 3.667576468140291e-06, + "loss": 0.8805244, + "num_input_tokens_seen": 3361325, + "step": 158, + "time_per_iteration": 2.772573471069336 + }, + { + "auxiliary_loss_clip": 0.01338018, + "auxiliary_loss_mlp": 0.01133713, + "balance_loss_clip": 1.12185466, + "balance_loss_mlp": 1.03300548, + "epoch": 0.019118619611615463, + "flos": 29305261146240.0, + "grad_norm": 4.452429527119331, + "language_loss": 0.89010525, + "learning_rate": 3.672147120210184e-06, + "loss": 0.91482252, + "num_input_tokens_seen": 3377925, + "step": 159, + "time_per_iteration": 2.854275703430176 + }, + { + "auxiliary_loss_clip": 0.01342301, + "auxiliary_loss_mlp": 0.01128619, + "balance_loss_clip": 1.12631667, + "balance_loss_mlp": 1.02819753, + "epoch": 0.019238862502254553, + "flos": 20886723797760.0, + "grad_norm": 2.4486506605396396, + "language_loss": 0.864236, + "learning_rate": 3.6766891159659177e-06, + "loss": 0.88894522, + "num_input_tokens_seen": 3396335, + "step": 160, + "time_per_iteration": 2.813072443008423 + }, + { + "auxiliary_loss_clip": 0.01340183, + "auxiliary_loss_mlp": 0.01130578, + "balance_loss_clip": 1.12437499, + "balance_loss_mlp": 1.03044271, + "epoch": 0.019359105392893646, + "flos": 21360672777600.0, + "grad_norm": 3.3697172558476014, + "language_loss": 0.88121206, + "learning_rate": 3.6812028124990075e-06, + "loss": 0.90591967, + "num_input_tokens_seen": 3413605, + "step": 161, + "time_per_iteration": 2.777724027633667 + }, + { + "auxiliary_loss_clip": 0.0133619, + "auxiliary_loss_mlp": 0.01125676, + "balance_loss_clip": 1.12059152, + "balance_loss_mlp": 1.02658916, + "epoch": 0.019479348283532736, + "flos": 16283729681280.0, + "grad_norm": 3.2409877312175808, + "language_loss": 0.8157196, + "learning_rate": 3.6856885602676016e-06, + "loss": 0.84033829, + "num_input_tokens_seen": 3429640, + "step": 162, + "time_per_iteration": 2.7174642086029053 + }, + { + "auxiliary_loss_clip": 0.01335997, + "auxiliary_loss_mlp": 0.01128666, + "balance_loss_clip": 1.11990893, + "balance_loss_mlp": 1.02838683, + "epoch": 0.019599591174171826, + "flos": 22091239497600.0, + "grad_norm": 3.2188261298597007, + "language_loss": 0.93934202, + "learning_rate": 3.6901467032597733e-06, + "loss": 0.96398854, + "num_input_tokens_seen": 3448125, + "step": 163, + "time_per_iteration": 2.869816303253174 + }, + { + "auxiliary_loss_clip": 0.01336163, + "auxiliary_loss_mlp": 0.01124341, + "balance_loss_clip": 1.12098742, + "balance_loss_mlp": 1.0243485, + "epoch": 0.01971983406481092, + "flos": 19609668581760.0, + "grad_norm": 2.067066381877545, + "language_loss": 0.87361097, + "learning_rate": 3.694577579151804e-06, + "loss": 0.89821601, + "num_input_tokens_seen": 3466535, + "step": 164, + "time_per_iteration": 2.814781904220581 + }, + { + "auxiliary_loss_clip": 0.01337338, + "auxiliary_loss_mlp": 0.0112839, + "balance_loss_clip": 1.12162948, + "balance_loss_mlp": 1.0291127, + "epoch": 0.01984007695545001, + "flos": 19099342103040.0, + "grad_norm": 2.4736806489184735, + "language_loss": 0.73652864, + "learning_rate": 3.6989815194616703e-06, + "loss": 0.761186, + "num_input_tokens_seen": 3483730, + "step": 165, + "time_per_iteration": 2.840736150741577 + }, + { + "auxiliary_loss_clip": 0.01333591, + "auxiliary_loss_mlp": 0.01135783, + "balance_loss_clip": 1.11841869, + "balance_loss_mlp": 1.03412163, + "epoch": 0.0199603198460891, + "flos": 20848406964480.0, + "grad_norm": 2.243953564258661, + "language_loss": 0.79665756, + "learning_rate": 3.703358849697888e-06, + "loss": 0.82135135, + "num_input_tokens_seen": 3503640, + "step": 166, + "time_per_iteration": 2.8070619106292725 + }, + { + "auxiliary_loss_clip": 0.01335582, + "auxiliary_loss_mlp": 0.01123432, + "balance_loss_clip": 1.12024951, + "balance_loss_mlp": 1.02491748, + "epoch": 0.020080562736728192, + "flos": 21870747861120.0, + "grad_norm": 1.8411319191250048, + "language_loss": 0.82565945, + "learning_rate": 3.7077098895038803e-06, + "loss": 0.85024959, + "num_input_tokens_seen": 3523010, + "step": 167, + "time_per_iteration": 2.8144023418426514 + }, + { + "auxiliary_loss_clip": 0.01336708, + "auxiliary_loss_mlp": 0.0112654, + "balance_loss_clip": 1.12104094, + "balance_loss_mlp": 1.02659464, + "epoch": 0.020200805627367282, + "flos": 21688788539520.0, + "grad_norm": 2.5044520468965747, + "language_loss": 0.96792376, + "learning_rate": 3.712034952798045e-06, + "loss": 0.99255621, + "num_input_tokens_seen": 3541125, + "step": 168, + "time_per_iteration": 4.675708055496216 + }, + { + "auxiliary_loss_clip": 0.01334779, + "auxiliary_loss_mlp": 0.01128392, + "balance_loss_clip": 1.11928272, + "balance_loss_mlp": 1.02744508, + "epoch": 0.02032104851800637, + "flos": 33543043729920.0, + "grad_norm": 2.4220138736464962, + "language_loss": 0.84644723, + "learning_rate": 3.7163343479096656e-06, + "loss": 0.87107897, + "num_input_tokens_seen": 3562700, + "step": 169, + "time_per_iteration": 3.866880178451538 + }, + { + "auxiliary_loss_clip": 0.01333939, + "auxiliary_loss_mlp": 0.01123703, + "balance_loss_clip": 1.1187973, + "balance_loss_mlp": 1.02542675, + "epoch": 0.020441291408645465, + "flos": 31686965274240.0, + "grad_norm": 4.2777615878369275, + "language_loss": 0.82928348, + "learning_rate": 3.720608377710802e-06, + "loss": 0.85385984, + "num_input_tokens_seen": 3582790, + "step": 170, + "time_per_iteration": 2.8310787677764893 + }, + { + "auxiliary_loss_clip": 0.01333434, + "auxiliary_loss_mlp": 0.01128692, + "balance_loss_clip": 1.11965299, + "balance_loss_mlp": 1.02965331, + "epoch": 0.020561534299284555, + "flos": 20886687884160.0, + "grad_norm": 2.270151950420398, + "language_loss": 0.86663616, + "learning_rate": 3.7248573397443277e-06, + "loss": 0.89125746, + "num_input_tokens_seen": 3601715, + "step": 171, + "time_per_iteration": 2.8090479373931885 + }, + { + "auxiliary_loss_clip": 0.0133489, + "auxiliary_loss_mlp": 0.01124353, + "balance_loss_clip": 1.12012601, + "balance_loss_mlp": 1.0262202, + "epoch": 0.020681777189923645, + "flos": 20996610480000.0, + "grad_norm": 2.4457406317013315, + "language_loss": 0.97525728, + "learning_rate": 3.729081526348224e-06, + "loss": 0.99984974, + "num_input_tokens_seen": 3620245, + "step": 172, + "time_per_iteration": 2.760366439819336 + }, + { + "auxiliary_loss_clip": 0.013324, + "auxiliary_loss_mlp": 0.01125728, + "balance_loss_clip": 1.11778927, + "balance_loss_mlp": 1.02749968, + "epoch": 0.020802020080562738, + "flos": 28257532312320.0, + "grad_norm": 1.8183535328971754, + "language_loss": 0.85015017, + "learning_rate": 3.7332812247762777e-06, + "loss": 0.87473148, + "num_input_tokens_seen": 3641545, + "step": 173, + "time_per_iteration": 2.8820250034332275 + }, + { + "auxiliary_loss_clip": 0.01332772, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_clip": 1.11844862, + "balance_loss_mlp": 1.0314666, + "epoch": 0.020922262971201828, + "flos": 19681274344320.0, + "grad_norm": 3.528125630020664, + "language_loss": 0.95373607, + "learning_rate": 3.737456717315293e-06, + "loss": 0.97836363, + "num_input_tokens_seen": 3660510, + "step": 174, + "time_per_iteration": 2.781439781188965 + }, + { + "auxiliary_loss_clip": 0.01333274, + "auxiliary_loss_mlp": 0.01118566, + "balance_loss_clip": 1.11925578, + "balance_loss_mlp": 1.02238822, + "epoch": 0.021042505861840918, + "flos": 15666353694720.0, + "grad_norm": 2.426175374465451, + "language_loss": 0.90899587, + "learning_rate": 3.7416082813989552e-06, + "loss": 0.93351424, + "num_input_tokens_seen": 3677505, + "step": 175, + "time_per_iteration": 2.7780535221099854 + }, + { + "auxiliary_loss_clip": 0.01330651, + "auxiliary_loss_mlp": 0.0113092, + "balance_loss_clip": 1.11649179, + "balance_loss_mlp": 1.03288209, + "epoch": 0.02116274875248001, + "flos": 21142012734720.0, + "grad_norm": 2.2384434087507983, + "language_loss": 0.89685178, + "learning_rate": 3.745736189718439e-06, + "loss": 0.92146742, + "num_input_tokens_seen": 3696760, + "step": 176, + "time_per_iteration": 2.8263256549835205 + }, + { + "auxiliary_loss_clip": 0.01329251, + "auxiliary_loss_mlp": 0.01128185, + "balance_loss_clip": 1.11598969, + "balance_loss_mlp": 1.03009963, + "epoch": 0.0212829916431191, + "flos": 24715770543360.0, + "grad_norm": 4.3840194090872515, + "language_loss": 0.72617769, + "learning_rate": 3.749840710329894e-06, + "loss": 0.75075203, + "num_input_tokens_seen": 3717465, + "step": 177, + "time_per_iteration": 2.794358968734741 + }, + { + "auxiliary_loss_clip": 0.01331029, + "auxiliary_loss_mlp": 0.0112617, + "balance_loss_clip": 1.1169349, + "balance_loss_mlp": 1.02741742, + "epoch": 0.02140323453375819, + "flos": 16645493508480.0, + "grad_norm": 3.124501975650996, + "language_loss": 0.97686076, + "learning_rate": 3.7539221067588938e-06, + "loss": 1.00143266, + "num_input_tokens_seen": 3731440, + "step": 178, + "time_per_iteration": 2.8395767211914062 + }, + { + "auxiliary_loss_clip": 0.01327448, + "auxiliary_loss_mlp": 0.01129283, + "balance_loss_clip": 1.11403155, + "balance_loss_mlp": 1.03033948, + "epoch": 0.021523477424397284, + "flos": 20299332689280.0, + "grad_norm": 3.7906952624738874, + "language_loss": 0.93934381, + "learning_rate": 3.757980638101964e-06, + "loss": 0.96391118, + "num_input_tokens_seen": 3744935, + "step": 179, + "time_per_iteration": 2.7623281478881836 + }, + { + "auxiliary_loss_clip": 0.01329257, + "auxiliary_loss_mlp": 0.01124886, + "balance_loss_clip": 1.11576688, + "balance_loss_mlp": 1.02732539, + "epoch": 0.021643720315036374, + "flos": 26104005331200.0, + "grad_norm": 4.592231185049649, + "language_loss": 0.89486402, + "learning_rate": 3.7620165591252806e-06, + "loss": 0.91940546, + "num_input_tokens_seen": 3763035, + "step": 180, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.01329377, + "auxiliary_loss_mlp": 0.01125714, + "balance_loss_clip": 1.11610222, + "balance_loss_mlp": 1.02820051, + "epoch": 0.021763963205675464, + "flos": 24787663614720.0, + "grad_norm": 1.8164574871532895, + "language_loss": 0.94456649, + "learning_rate": 3.766030120360636e-06, + "loss": 0.9691174, + "num_input_tokens_seen": 3782665, + "step": 181, + "time_per_iteration": 2.8351128101348877 + }, + { + "auxiliary_loss_clip": 0.01327692, + "auxiliary_loss_mlp": 0.01125244, + "balance_loss_clip": 1.11420059, + "balance_loss_mlp": 1.02720642, + "epoch": 0.021884206096314557, + "flos": 25813559957760.0, + "grad_norm": 3.2454135662336783, + "language_loss": 0.90262151, + "learning_rate": 3.7700215681987578e-06, + "loss": 0.92715096, + "num_input_tokens_seen": 3802435, + "step": 182, + "time_per_iteration": 2.822972059249878 + }, + { + "auxiliary_loss_clip": 0.01327905, + "auxiliary_loss_mlp": 0.01125686, + "balance_loss_clip": 1.11491537, + "balance_loss_mlp": 1.02783871, + "epoch": 0.022004448986953647, + "flos": 20082719721600.0, + "grad_norm": 1.84079013796925, + "language_loss": 0.82227796, + "learning_rate": 3.7739911449800767e-06, + "loss": 0.84681392, + "num_input_tokens_seen": 3822490, + "step": 183, + "time_per_iteration": 2.7130725383758545 + }, + { + "auxiliary_loss_clip": 0.01325553, + "auxiliary_loss_mlp": 0.01119268, + "balance_loss_clip": 1.11264157, + "balance_loss_mlp": 1.02146924, + "epoch": 0.022124691877592736, + "flos": 20480609652480.0, + "grad_norm": 2.686733306007163, + "language_loss": 0.80589044, + "learning_rate": 3.7779390890830114e-06, + "loss": 0.8303386, + "num_input_tokens_seen": 3841140, + "step": 184, + "time_per_iteration": 2.836822986602783 + }, + { + "auxiliary_loss_clip": 0.01323271, + "auxiliary_loss_mlp": 0.01128998, + "balance_loss_clip": 1.11105275, + "balance_loss_mlp": 1.03181911, + "epoch": 0.02224493476823183, + "flos": 23586847015680.0, + "grad_norm": 2.100206412262428, + "language_loss": 0.85869277, + "learning_rate": 3.7818656350098723e-06, + "loss": 0.88321543, + "num_input_tokens_seen": 3862090, + "step": 185, + "time_per_iteration": 2.8245065212249756 + }, + { + "auxiliary_loss_clip": 0.01324516, + "auxiliary_loss_mlp": 0.01121785, + "balance_loss_clip": 1.11248541, + "balance_loss_mlp": 1.02470112, + "epoch": 0.02236517765887092, + "flos": 16909940413440.0, + "grad_norm": 3.3571285052307505, + "language_loss": 0.77408683, + "learning_rate": 3.7857710134704447e-06, + "loss": 0.79854989, + "num_input_tokens_seen": 3881025, + "step": 186, + "time_per_iteration": 2.809821128845215 + }, + { + "auxiliary_loss_clip": 0.01325743, + "auxiliary_loss_mlp": 0.01119668, + "balance_loss_clip": 1.11357701, + "balance_loss_mlp": 1.02310848, + "epoch": 0.02248542054951001, + "flos": 43508182930560.0, + "grad_norm": 2.2146678256515107, + "language_loss": 0.79426748, + "learning_rate": 3.7896554514633234e-06, + "loss": 0.81872153, + "num_input_tokens_seen": 3905310, + "step": 187, + "time_per_iteration": 2.96390438079834 + }, + { + "auxiliary_loss_clip": 0.01324836, + "auxiliary_loss_mlp": 0.01123049, + "balance_loss_clip": 1.11299312, + "balance_loss_mlp": 1.02563155, + "epoch": 0.022605663440149103, + "flos": 23367648268800.0, + "grad_norm": 1.98611601257085, + "language_loss": 0.84240139, + "learning_rate": 3.7935191723550955e-06, + "loss": 0.8668803, + "num_input_tokens_seen": 3924265, + "step": 188, + "time_per_iteration": 2.7972137928009033 + }, + { + "auxiliary_loss_clip": 0.013228, + "auxiliary_loss_mlp": 0.01121454, + "balance_loss_clip": 1.11142278, + "balance_loss_mlp": 1.02575314, + "epoch": 0.022725906330788193, + "flos": 29019915504000.0, + "grad_norm": 2.0062373967291536, + "language_loss": 0.88646173, + "learning_rate": 3.797362395957408e-06, + "loss": 0.91090429, + "num_input_tokens_seen": 3944830, + "step": 189, + "time_per_iteration": 2.864858865737915 + }, + { + "auxiliary_loss_clip": 0.01325542, + "auxiliary_loss_mlp": 0.01126055, + "balance_loss_clip": 1.11384523, + "balance_loss_mlp": 1.02944791, + "epoch": 0.022846149221427282, + "flos": 24496176746880.0, + "grad_norm": 3.9813427735087434, + "language_loss": 0.78185791, + "learning_rate": 3.8011853386020055e-06, + "loss": 0.80637383, + "num_input_tokens_seen": 3965735, + "step": 190, + "time_per_iteration": 2.759596586227417 + }, + { + "auxiliary_loss_clip": 0.01323553, + "auxiliary_loss_mlp": 0.01120259, + "balance_loss_clip": 1.11199808, + "balance_loss_mlp": 1.02355671, + "epoch": 0.022966392112066376, + "flos": 15523537219200.0, + "grad_norm": 3.2855938894249523, + "language_loss": 0.89719117, + "learning_rate": 3.804988213213804e-06, + "loss": 0.92162925, + "num_input_tokens_seen": 3983975, + "step": 191, + "time_per_iteration": 2.768817901611328 + }, + { + "auxiliary_loss_clip": 0.01360715, + "auxiliary_loss_mlp": 0.01086991, + "balance_loss_clip": 1.15219283, + "balance_loss_mlp": 1.00230432, + "epoch": 0.023086635002705466, + "flos": 55650408433920.0, + "grad_norm": 1.0116129143411363, + "language_loss": 0.63196361, + "learning_rate": 3.808771229382049e-06, + "loss": 0.65644062, + "num_input_tokens_seen": 4043440, + "step": 192, + "time_per_iteration": 3.223794937133789 + }, + { + "auxiliary_loss_clip": 0.0132171, + "auxiliary_loss_mlp": 0.01114446, + "balance_loss_clip": 1.1103425, + "balance_loss_mlp": 1.01822031, + "epoch": 0.023206877893344555, + "flos": 19313441118720.0, + "grad_norm": 1.8880737773981433, + "language_loss": 0.84356892, + "learning_rate": 3.8125345934296324e-06, + "loss": 0.86793053, + "num_input_tokens_seen": 4061750, + "step": 193, + "time_per_iteration": 3.7115602493286133 + }, + { + "auxiliary_loss_clip": 0.01319815, + "auxiliary_loss_mlp": 0.01123587, + "balance_loss_clip": 1.10878539, + "balance_loss_mlp": 1.02621663, + "epoch": 0.02332712078398365, + "flos": 23072965090560.0, + "grad_norm": 2.025078481312149, + "language_loss": 0.87718093, + "learning_rate": 3.81627850848061e-06, + "loss": 0.90161496, + "num_input_tokens_seen": 4082345, + "step": 194, + "time_per_iteration": 4.817065238952637 + }, + { + "auxiliary_loss_clip": 0.01320796, + "auxiliary_loss_mlp": 0.01117369, + "balance_loss_clip": 1.11016691, + "balance_loss_mlp": 1.02147675, + "epoch": 0.02344736367462274, + "flos": 24425971614720.0, + "grad_norm": 2.214450255485101, + "language_loss": 0.86046743, + "learning_rate": 3.820003174525994e-06, + "loss": 0.88484907, + "num_input_tokens_seen": 4101770, + "step": 195, + "time_per_iteration": 2.849453926086426 + }, + { + "auxiliary_loss_clip": 0.0131949, + "auxiliary_loss_mlp": 0.01122389, + "balance_loss_clip": 1.1092627, + "balance_loss_mlp": 1.02597225, + "epoch": 0.02356760656526183, + "flos": 21579799697280.0, + "grad_norm": 3.1780359698818783, + "language_loss": 0.82650602, + "learning_rate": 3.823708788487851e-06, + "loss": 0.85092479, + "num_input_tokens_seen": 4118770, + "step": 196, + "time_per_iteration": 2.7985310554504395 + }, + { + "auxiliary_loss_clip": 0.01321923, + "auxiliary_loss_mlp": 0.01118535, + "balance_loss_clip": 1.11118376, + "balance_loss_mlp": 1.02288115, + "epoch": 0.02368784945590092, + "flos": 25193598192000.0, + "grad_norm": 2.0271590002381488, + "language_loss": 0.84470761, + "learning_rate": 3.827395544281781e-06, + "loss": 0.86911219, + "num_input_tokens_seen": 4141110, + "step": 197, + "time_per_iteration": 2.8323850631713867 + }, + { + "auxiliary_loss_clip": 0.01320385, + "auxiliary_loss_mlp": 0.01122626, + "balance_loss_clip": 1.10998321, + "balance_loss_mlp": 1.02644801, + "epoch": 0.02380809234654001, + "flos": 27562481164800.0, + "grad_norm": 1.9649504165990144, + "language_loss": 0.78820646, + "learning_rate": 3.831063632877802e-06, + "loss": 0.81263655, + "num_input_tokens_seen": 4161430, + "step": 198, + "time_per_iteration": 2.885533332824707 + }, + { + "auxiliary_loss_clip": 0.01323967, + "auxiliary_loss_mlp": 0.01113102, + "balance_loss_clip": 1.11386991, + "balance_loss_mlp": 1.01878333, + "epoch": 0.0239283352371791, + "flos": 18259786540800.0, + "grad_norm": 3.0550885311675886, + "language_loss": 0.75871199, + "learning_rate": 3.834713242359712e-06, + "loss": 0.7830826, + "num_input_tokens_seen": 4179260, + "step": 199, + "time_per_iteration": 2.781116008758545 + }, + { + "auxiliary_loss_clip": 0.01317709, + "auxiliary_loss_mlp": 0.01126923, + "balance_loss_clip": 1.10778475, + "balance_loss_mlp": 1.03060174, + "epoch": 0.02404857812781819, + "flos": 21395110942080.0, + "grad_norm": 3.200018055546233, + "language_loss": 0.87140507, + "learning_rate": 3.838344557982959e-06, + "loss": 0.89585137, + "num_input_tokens_seen": 4200640, + "step": 200, + "time_per_iteration": 2.880269765853882 + }, + { + "auxiliary_loss_clip": 0.0131748, + "auxiliary_loss_mlp": 0.01120294, + "balance_loss_clip": 1.10761046, + "balance_loss_mlp": 1.02454555, + "epoch": 0.024168821018457284, + "flos": 16654256426880.0, + "grad_norm": 4.429228014203863, + "language_loss": 0.84754145, + "learning_rate": 3.841957762231063e-06, + "loss": 0.87191916, + "num_input_tokens_seen": 4218170, + "step": 201, + "time_per_iteration": 2.7489821910858154 + }, + { + "auxiliary_loss_clip": 0.01314954, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_clip": 1.10529065, + "balance_loss_mlp": 1.01522195, + "epoch": 0.024289063909096374, + "flos": 22820872464000.0, + "grad_norm": 3.047204548469664, + "language_loss": 0.87773967, + "learning_rate": 3.8455530348706454e-06, + "loss": 0.90199029, + "num_input_tokens_seen": 4237770, + "step": 202, + "time_per_iteration": 2.85807466506958 + }, + { + "auxiliary_loss_clip": 0.01317929, + "auxiliary_loss_mlp": 0.01115501, + "balance_loss_clip": 1.10859168, + "balance_loss_mlp": 1.02084923, + "epoch": 0.024409306799735464, + "flos": 17748598135680.0, + "grad_norm": 2.3640182960208085, + "language_loss": 0.77342916, + "learning_rate": 3.849130553005099e-06, + "loss": 0.79776341, + "num_input_tokens_seen": 4255985, + "step": 203, + "time_per_iteration": 2.755375623703003 + }, + { + "auxiliary_loss_clip": 0.01315585, + "auxiliary_loss_mlp": 0.01117675, + "balance_loss_clip": 1.10647011, + "balance_loss_mlp": 1.02283239, + "epoch": 0.024529549690374557, + "flos": 21616213109760.0, + "grad_norm": 1.9770333889156206, + "language_loss": 0.83700776, + "learning_rate": 3.852690491126933e-06, + "loss": 0.86134034, + "num_input_tokens_seen": 4276035, + "step": 204, + "time_per_iteration": 2.8150811195373535 + }, + { + "auxiliary_loss_clip": 0.01315592, + "auxiliary_loss_mlp": 0.01123719, + "balance_loss_clip": 1.1061008, + "balance_loss_mlp": 1.02830362, + "epoch": 0.024649792581013647, + "flos": 25551662918400.0, + "grad_norm": 2.3134805134521224, + "language_loss": 0.90983605, + "learning_rate": 3.856233021168845e-06, + "loss": 0.9342292, + "num_input_tokens_seen": 4295730, + "step": 205, + "time_per_iteration": 2.8747589588165283 + }, + { + "auxiliary_loss_clip": 0.01315803, + "auxiliary_loss_mlp": 0.01118225, + "balance_loss_clip": 1.10696435, + "balance_loss_mlp": 1.02376378, + "epoch": 0.024770035471652737, + "flos": 34495574544000.0, + "grad_norm": 2.7143955363292784, + "language_loss": 0.91424978, + "learning_rate": 3.859758312553544e-06, + "loss": 0.93859005, + "num_input_tokens_seen": 4317950, + "step": 206, + "time_per_iteration": 2.926936388015747 + }, + { + "auxiliary_loss_clip": 0.01316515, + "auxiliary_loss_mlp": 0.01114387, + "balance_loss_clip": 1.10793436, + "balance_loss_mlp": 1.01963985, + "epoch": 0.02489027836229183, + "flos": 21505428587520.0, + "grad_norm": 1.9041045528967306, + "language_loss": 0.91848499, + "learning_rate": 3.8632665322423735e-06, + "loss": 0.94279397, + "num_input_tokens_seen": 4337605, + "step": 207, + "time_per_iteration": 2.9285340309143066 + }, + { + "auxiliary_loss_clip": 0.01313901, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_clip": 1.10534322, + "balance_loss_mlp": 1.03016925, + "epoch": 0.02501052125293092, + "flos": 23219013790080.0, + "grad_norm": 2.1885845101299584, + "language_loss": 0.85637367, + "learning_rate": 3.866757844782762e-06, + "loss": 0.88076371, + "num_input_tokens_seen": 4358110, + "step": 208, + "time_per_iteration": 2.8557162284851074 + }, + { + "auxiliary_loss_clip": 0.0131639, + "auxiliary_loss_mlp": 0.01115606, + "balance_loss_clip": 1.10780561, + "balance_loss_mlp": 1.02095413, + "epoch": 0.02513076414357001, + "flos": 26388920010240.0, + "grad_norm": 2.2493865080537194, + "language_loss": 0.91549003, + "learning_rate": 3.870232412354527e-06, + "loss": 0.93981004, + "num_input_tokens_seen": 4374955, + "step": 209, + "time_per_iteration": 2.8831043243408203 + }, + { + "auxiliary_loss_clip": 0.0131178, + "auxiliary_loss_mlp": 0.01115248, + "balance_loss_clip": 1.10353732, + "balance_loss_mlp": 1.02059531, + "epoch": 0.025251007034209103, + "flos": 13590430047360.0, + "grad_norm": 2.643610963705673, + "language_loss": 0.92590296, + "learning_rate": 3.873690394815086e-06, + "loss": 0.95017326, + "num_input_tokens_seen": 4391535, + "step": 210, + "time_per_iteration": 2.7723405361175537 + }, + { + "auxiliary_loss_clip": 0.01313625, + "auxiliary_loss_mlp": 0.01116384, + "balance_loss_clip": 1.10553586, + "balance_loss_mlp": 1.02254212, + "epoch": 0.025371249924848193, + "flos": 15049229103360.0, + "grad_norm": 2.449211864554208, + "language_loss": 0.9133479, + "learning_rate": 3.877131949743587e-06, + "loss": 0.93764794, + "num_input_tokens_seen": 4408400, + "step": 211, + "time_per_iteration": 2.8052175045013428 + }, + { + "auxiliary_loss_clip": 0.01314837, + "auxiliary_loss_mlp": 0.01118416, + "balance_loss_clip": 1.10670829, + "balance_loss_mlp": 1.02314401, + "epoch": 0.025491492815487283, + "flos": 25553853648000.0, + "grad_norm": 2.5556973798363116, + "language_loss": 0.7800141, + "learning_rate": 3.880557232483993e-06, + "loss": 0.80434668, + "num_input_tokens_seen": 4427840, + "step": 212, + "time_per_iteration": 2.865285634994507 + }, + { + "auxiliary_loss_clip": 0.01311265, + "auxiliary_loss_mlp": 0.01117719, + "balance_loss_clip": 1.10350394, + "balance_loss_mlp": 1.02321029, + "epoch": 0.025611735706126376, + "flos": 20630752502400.0, + "grad_norm": 2.914112043645268, + "language_loss": 0.86777902, + "learning_rate": 3.883966396187164e-06, + "loss": 0.89206886, + "num_input_tokens_seen": 4447110, + "step": 213, + "time_per_iteration": 2.8421642780303955 + }, + { + "auxiliary_loss_clip": 0.01312107, + "auxiliary_loss_mlp": 0.01114315, + "balance_loss_clip": 1.10495901, + "balance_loss_mlp": 1.020998, + "epoch": 0.025731978596765466, + "flos": 19062282245760.0, + "grad_norm": 2.3685967386584923, + "language_loss": 0.90115619, + "learning_rate": 3.887359591851937e-06, + "loss": 0.9254204, + "num_input_tokens_seen": 4464715, + "step": 214, + "time_per_iteration": 2.846015214920044 + }, + { + "auxiliary_loss_clip": 0.01313383, + "auxiliary_loss_mlp": 0.01115707, + "balance_loss_clip": 1.10635781, + "balance_loss_mlp": 1.02248526, + "epoch": 0.025852221487404556, + "flos": 22163814927360.0, + "grad_norm": 1.6523534930592425, + "language_loss": 0.92225134, + "learning_rate": 3.890736968365265e-06, + "loss": 0.94654226, + "num_input_tokens_seen": 4485030, + "step": 215, + "time_per_iteration": 2.8668465614318848 + }, + { + "auxiliary_loss_clip": 0.0131226, + "auxiliary_loss_mlp": 0.01114995, + "balance_loss_clip": 1.10524261, + "balance_loss_mlp": 1.02091479, + "epoch": 0.02597246437804365, + "flos": 26541971861760.0, + "grad_norm": 1.8523890496112163, + "language_loss": 0.8511771, + "learning_rate": 3.894098672541412e-06, + "loss": 0.87544966, + "num_input_tokens_seen": 4505935, + "step": 216, + "time_per_iteration": 2.9087066650390625 + }, + { + "auxiliary_loss_clip": 0.01310673, + "auxiliary_loss_mlp": 0.01116755, + "balance_loss_clip": 1.1039772, + "balance_loss_mlp": 1.02296114, + "epoch": 0.02609270726868274, + "flos": 32671671696000.0, + "grad_norm": 1.9031054145800994, + "language_loss": 0.7537998, + "learning_rate": 3.89744484916025e-06, + "loss": 0.77807403, + "num_input_tokens_seen": 4527045, + "step": 217, + "time_per_iteration": 2.912503957748413 + }, + { + "auxiliary_loss_clip": 0.01310953, + "auxiliary_loss_mlp": 0.01117711, + "balance_loss_clip": 1.10359049, + "balance_loss_mlp": 1.02320218, + "epoch": 0.02621295015932183, + "flos": 26243553669120.0, + "grad_norm": 2.1504787551542215, + "language_loss": 0.87039196, + "learning_rate": 3.900775641004673e-06, + "loss": 0.89467871, + "num_input_tokens_seen": 4546360, + "step": 218, + "time_per_iteration": 2.8239779472351074 + }, + { + "auxiliary_loss_clip": 0.01311596, + "auxiliary_loss_mlp": 0.01114946, + "balance_loss_clip": 1.10477757, + "balance_loss_mlp": 1.01996028, + "epoch": 0.026333193049960922, + "flos": 42921402353280.0, + "grad_norm": 2.554070887090334, + "language_loss": 0.73971713, + "learning_rate": 3.904091188897156e-06, + "loss": 0.76398253, + "num_input_tokens_seen": 4565495, + "step": 219, + "time_per_iteration": 4.0887451171875 + }, + { + "auxiliary_loss_clip": 0.01311259, + "auxiliary_loss_mlp": 0.01117733, + "balance_loss_clip": 1.10527587, + "balance_loss_mlp": 1.02422512, + "epoch": 0.026453435940600012, + "flos": 17963846386560.0, + "grad_norm": 2.093980102043644, + "language_loss": 0.821841, + "learning_rate": 3.90739163173548e-06, + "loss": 0.84613097, + "num_input_tokens_seen": 4583330, + "step": 220, + "time_per_iteration": 4.618932247161865 + }, + { + "auxiliary_loss_clip": 0.01309083, + "auxiliary_loss_mlp": 0.0111193, + "balance_loss_clip": 1.10343838, + "balance_loss_mlp": 1.01847041, + "epoch": 0.026573678831239102, + "flos": 18984319776000.0, + "grad_norm": 2.283974070831139, + "language_loss": 0.88449109, + "learning_rate": 3.910677106527646e-06, + "loss": 0.90870118, + "num_input_tokens_seen": 4600520, + "step": 221, + "time_per_iteration": 2.8274197578430176 + }, + { + "auxiliary_loss_clip": 0.01308841, + "auxiliary_loss_mlp": 0.01115617, + "balance_loss_clip": 1.10321009, + "balance_loss_mlp": 1.02239537, + "epoch": 0.026693921721878195, + "flos": 29241448634880.0, + "grad_norm": 2.4312160056198278, + "language_loss": 0.84247792, + "learning_rate": 3.913947748426004e-06, + "loss": 0.86672246, + "num_input_tokens_seen": 4617340, + "step": 222, + "time_per_iteration": 2.9593331813812256 + }, + { + "auxiliary_loss_clip": 0.01309479, + "auxiliary_loss_mlp": 0.01115545, + "balance_loss_clip": 1.10406709, + "balance_loss_mlp": 1.0217514, + "epoch": 0.026814164612517285, + "flos": 14128083797760.0, + "grad_norm": 2.478415654407726, + "language_loss": 0.76732576, + "learning_rate": 3.9172036907606136e-06, + "loss": 0.79157591, + "num_input_tokens_seen": 4630820, + "step": 223, + "time_per_iteration": 2.7118632793426514 + }, + { + "auxiliary_loss_clip": 0.01306401, + "auxiliary_loss_mlp": 0.01120799, + "balance_loss_clip": 1.10050201, + "balance_loss_mlp": 1.02624226, + "epoch": 0.026934407503156375, + "flos": 23511973115520.0, + "grad_norm": 1.9427902210653287, + "language_loss": 0.95074999, + "learning_rate": 3.920445065071855e-06, + "loss": 0.97502202, + "num_input_tokens_seen": 4651985, + "step": 224, + "time_per_iteration": 2.981510639190674 + }, + { + "auxiliary_loss_clip": 0.0130787, + "auxiliary_loss_mlp": 0.01115226, + "balance_loss_clip": 1.10246468, + "balance_loss_mlp": 1.02162337, + "epoch": 0.027054650393795468, + "flos": 28950356816640.0, + "grad_norm": 2.4167994838356894, + "language_loss": 0.79826748, + "learning_rate": 3.923672001142322e-06, + "loss": 0.82249844, + "num_input_tokens_seen": 4672295, + "step": 225, + "time_per_iteration": 2.9032223224639893 + }, + { + "auxiliary_loss_clip": 0.01305974, + "auxiliary_loss_mlp": 0.01117018, + "balance_loss_clip": 1.10078776, + "balance_loss_mlp": 1.02279508, + "epoch": 0.027174893284434558, + "flos": 31431568596480.0, + "grad_norm": 2.3216985399627585, + "language_loss": 0.84567732, + "learning_rate": 3.926884627027996e-06, + "loss": 0.86990726, + "num_input_tokens_seen": 4696065, + "step": 226, + "time_per_iteration": 2.9400370121002197 + }, + { + "auxiliary_loss_clip": 0.01305947, + "auxiliary_loss_mlp": 0.01113712, + "balance_loss_clip": 1.10107076, + "balance_loss_mlp": 1.02044308, + "epoch": 0.027295136175073648, + "flos": 22054466949120.0, + "grad_norm": 1.9303273248209791, + "language_loss": 0.77505887, + "learning_rate": 3.930083069088744e-06, + "loss": 0.79925543, + "num_input_tokens_seen": 4716065, + "step": 227, + "time_per_iteration": 2.963283061981201 + }, + { + "auxiliary_loss_clip": 0.01343304, + "auxiliary_loss_mlp": 0.0108394, + "balance_loss_clip": 1.14090562, + "balance_loss_mlp": 1.00001621, + "epoch": 0.02741537906571274, + "flos": 60800752972800.0, + "grad_norm": 0.9941786848804383, + "language_loss": 0.59298617, + "learning_rate": 3.933267452018137e-06, + "loss": 0.61725861, + "num_input_tokens_seen": 4775860, + "step": 228, + "time_per_iteration": 3.315208673477173 + }, + { + "auxiliary_loss_clip": 0.0130834, + "auxiliary_loss_mlp": 0.01112787, + "balance_loss_clip": 1.10370135, + "balance_loss_mlp": 1.01956511, + "epoch": 0.02753562195635183, + "flos": 24606278910720.0, + "grad_norm": 1.9057578094664076, + "language_loss": 0.8402077, + "learning_rate": 3.936437898872622e-06, + "loss": 0.86441904, + "num_input_tokens_seen": 4795835, + "step": 229, + "time_per_iteration": 2.91603422164917 + }, + { + "auxiliary_loss_clip": 0.0130734, + "auxiliary_loss_mlp": 0.01116018, + "balance_loss_clip": 1.10240197, + "balance_loss_mlp": 1.02198613, + "epoch": 0.02765586484699092, + "flos": 34094236907520.0, + "grad_norm": 2.748593750144763, + "language_loss": 0.7954371, + "learning_rate": 3.9395945311000525e-06, + "loss": 0.81967074, + "num_input_tokens_seen": 4817460, + "step": 230, + "time_per_iteration": 2.9332573413848877 + }, + { + "auxiliary_loss_clip": 0.01304514, + "auxiliary_loss_mlp": 0.01118468, + "balance_loss_clip": 1.10022044, + "balance_loss_mlp": 1.02462602, + "epoch": 0.027776107737630014, + "flos": 14829922615680.0, + "grad_norm": 2.0346793041180025, + "language_loss": 0.91016471, + "learning_rate": 3.942737468567608e-06, + "loss": 0.93439448, + "num_input_tokens_seen": 4835475, + "step": 231, + "time_per_iteration": 2.7808361053466797 + }, + { + "auxiliary_loss_clip": 0.01306298, + "auxiliary_loss_mlp": 0.01114582, + "balance_loss_clip": 1.10264325, + "balance_loss_mlp": 1.02126527, + "epoch": 0.027896350628269104, + "flos": 47920347066240.0, + "grad_norm": 4.68055323739165, + "language_loss": 0.86101174, + "learning_rate": 3.9458668295891026e-06, + "loss": 0.88522053, + "num_input_tokens_seen": 4857760, + "step": 232, + "time_per_iteration": 3.0668301582336426 + }, + { + "auxiliary_loss_clip": 0.01302914, + "auxiliary_loss_mlp": 0.01110907, + "balance_loss_clip": 1.09935069, + "balance_loss_mlp": 1.01763749, + "epoch": 0.028016593518908194, + "flos": 21684550734720.0, + "grad_norm": 2.340373988867155, + "language_loss": 0.86811054, + "learning_rate": 3.948982730951712e-06, + "loss": 0.89224869, + "num_input_tokens_seen": 4875855, + "step": 233, + "time_per_iteration": 2.7481627464294434 + }, + { + "auxiliary_loss_clip": 0.01308315, + "auxiliary_loss_mlp": 0.01114095, + "balance_loss_clip": 1.10398567, + "balance_loss_mlp": 1.02082515, + "epoch": 0.028136836409547287, + "flos": 18439483305600.0, + "grad_norm": 2.9030860121364928, + "language_loss": 0.82221454, + "learning_rate": 3.9520852879421254e-06, + "loss": 0.84643865, + "num_input_tokens_seen": 4893200, + "step": 234, + "time_per_iteration": 2.8936831951141357 + }, + { + "auxiliary_loss_clip": 0.01304477, + "auxiliary_loss_mlp": 0.01113561, + "balance_loss_clip": 1.10153842, + "balance_loss_mlp": 1.02062583, + "epoch": 0.028257079300186377, + "flos": 31576934937600.0, + "grad_norm": 3.2127162882853746, + "language_loss": 0.81514645, + "learning_rate": 3.955174614372137e-06, + "loss": 0.8393268, + "num_input_tokens_seen": 4912965, + "step": 235, + "time_per_iteration": 2.919975757598877 + }, + { + "auxiliary_loss_clip": 0.01303597, + "auxiliary_loss_mlp": 0.0112031, + "balance_loss_clip": 1.10085297, + "balance_loss_mlp": 1.02665877, + "epoch": 0.028377322190825467, + "flos": 23513337832320.0, + "grad_norm": 2.14335665160743, + "language_loss": 0.84269863, + "learning_rate": 3.9582508226037045e-06, + "loss": 0.86693776, + "num_input_tokens_seen": 4933105, + "step": 236, + "time_per_iteration": 2.8211510181427 + }, + { + "auxiliary_loss_clip": 0.01302111, + "auxiliary_loss_mlp": 0.01115289, + "balance_loss_clip": 1.09917796, + "balance_loss_mlp": 1.02125692, + "epoch": 0.02849756508146456, + "flos": 20479604071680.0, + "grad_norm": 2.7685157587910365, + "language_loss": 0.94004202, + "learning_rate": 3.9613140235734636e-06, + "loss": 0.96421605, + "num_input_tokens_seen": 4950085, + "step": 237, + "time_per_iteration": 2.909450054168701 + }, + { + "auxiliary_loss_clip": 0.0130588, + "auxiliary_loss_mlp": 0.01114376, + "balance_loss_clip": 1.10322094, + "balance_loss_mlp": 1.02129781, + "epoch": 0.02861780797210365, + "flos": 14283362292480.0, + "grad_norm": 2.0819843178198023, + "language_loss": 0.81199503, + "learning_rate": 3.96436432681674e-06, + "loss": 0.83619761, + "num_input_tokens_seen": 4968075, + "step": 238, + "time_per_iteration": 2.795448064804077 + }, + { + "auxiliary_loss_clip": 0.01304336, + "auxiliary_loss_mlp": 0.01114155, + "balance_loss_clip": 1.10182369, + "balance_loss_mlp": 1.02102876, + "epoch": 0.02873805086274274, + "flos": 25808532053760.0, + "grad_norm": 2.135641318966207, + "language_loss": 0.89275217, + "learning_rate": 3.967401840491044e-06, + "loss": 0.91693699, + "num_input_tokens_seen": 4987355, + "step": 239, + "time_per_iteration": 2.8982675075531006 + }, + { + "auxiliary_loss_clip": 0.01302965, + "auxiliary_loss_mlp": 0.01116053, + "balance_loss_clip": 1.10077107, + "balance_loss_mlp": 1.02283108, + "epoch": 0.028858293753381833, + "flos": 17304238984320.0, + "grad_norm": 2.270146238185787, + "language_loss": 0.8791945, + "learning_rate": 3.97042667139909e-06, + "loss": 0.90338463, + "num_input_tokens_seen": 5004680, + "step": 240, + "time_per_iteration": 2.9431869983673096 + }, + { + "auxiliary_loss_clip": 0.01302727, + "auxiliary_loss_mlp": 0.01112488, + "balance_loss_clip": 1.10044241, + "balance_loss_mlp": 1.01936162, + "epoch": 0.028978536644020923, + "flos": 23038347358080.0, + "grad_norm": 1.9874970676619628, + "language_loss": 0.87573487, + "learning_rate": 3.973438925011327e-06, + "loss": 0.89988697, + "num_input_tokens_seen": 5022965, + "step": 241, + "time_per_iteration": 2.7997875213623047 + }, + { + "auxiliary_loss_clip": 0.01300781, + "auxiliary_loss_mlp": 0.01111679, + "balance_loss_clip": 1.09879017, + "balance_loss_mlp": 1.01917255, + "epoch": 0.029098779534660012, + "flos": 28329712692480.0, + "grad_norm": 2.6289033046515864, + "language_loss": 0.91448462, + "learning_rate": 3.976438705488002e-06, + "loss": 0.93860918, + "num_input_tokens_seen": 5042625, + "step": 242, + "time_per_iteration": 2.904426097869873 + }, + { + "auxiliary_loss_clip": 0.0130244, + "auxiliary_loss_mlp": 0.0111241, + "balance_loss_clip": 1.10098052, + "balance_loss_mlp": 1.01985645, + "epoch": 0.029219022425299106, + "flos": 13881665520000.0, + "grad_norm": 2.7735940651644357, + "language_loss": 0.9309023, + "learning_rate": 3.9794261157007744e-06, + "loss": 0.95505083, + "num_input_tokens_seen": 5060380, + "step": 243, + "time_per_iteration": 2.829406976699829 + }, + { + "auxiliary_loss_clip": 0.01302039, + "auxiliary_loss_mlp": 0.01114225, + "balance_loss_clip": 1.10063481, + "balance_loss_mlp": 1.02176666, + "epoch": 0.029339265315938196, + "flos": 19422501788160.0, + "grad_norm": 2.6722829934147714, + "language_loss": 0.84662038, + "learning_rate": 3.982401257253887e-06, + "loss": 0.87078297, + "num_input_tokens_seen": 5078720, + "step": 244, + "time_per_iteration": 2.7979321479797363 + }, + { + "auxiliary_loss_clip": 0.01300322, + "auxiliary_loss_mlp": 0.01110219, + "balance_loss_clip": 1.09907734, + "balance_loss_mlp": 1.01795077, + "epoch": 0.029459508206577285, + "flos": 15669550005120.0, + "grad_norm": 2.0402032701169692, + "language_loss": 0.89792186, + "learning_rate": 3.985364230504893e-06, + "loss": 0.92202723, + "num_input_tokens_seen": 5096605, + "step": 245, + "time_per_iteration": 5.586026906967163 + }, + { + "auxiliary_loss_clip": 0.01301725, + "auxiliary_loss_mlp": 0.01115258, + "balance_loss_clip": 1.09983468, + "balance_loss_mlp": 1.02232265, + "epoch": 0.02957975109721638, + "flos": 28220975245440.0, + "grad_norm": 2.074679647874908, + "language_loss": 0.8420521, + "learning_rate": 3.988315134584976e-06, + "loss": 0.8662219, + "num_input_tokens_seen": 5116285, + "step": 246, + "time_per_iteration": 3.6916189193725586 + }, + { + "auxiliary_loss_clip": 0.01300774, + "auxiliary_loss_mlp": 0.01113134, + "balance_loss_clip": 1.09963155, + "balance_loss_mlp": 1.0199604, + "epoch": 0.02969999398785547, + "flos": 24315869450880.0, + "grad_norm": 2.003243640769942, + "language_loss": 0.80690908, + "learning_rate": 3.991254067418851e-06, + "loss": 0.83104813, + "num_input_tokens_seen": 5136825, + "step": 247, + "time_per_iteration": 2.801327705383301 + }, + { + "auxiliary_loss_clip": 0.01300876, + "auxiliary_loss_mlp": 0.01112952, + "balance_loss_clip": 1.10019302, + "balance_loss_mlp": 1.02096999, + "epoch": 0.02982023687849456, + "flos": 35078584193280.0, + "grad_norm": 2.0081458813783573, + "language_loss": 0.82880914, + "learning_rate": 3.994181125744254e-06, + "loss": 0.85294741, + "num_input_tokens_seen": 5158630, + "step": 248, + "time_per_iteration": 2.9139249324798584 + }, + { + "auxiliary_loss_clip": 0.01298485, + "auxiliary_loss_mlp": 0.01113296, + "balance_loss_clip": 1.097893, + "balance_loss_mlp": 1.02131414, + "epoch": 0.02994047976913365, + "flos": 26177155378560.0, + "grad_norm": 2.1001972664878426, + "language_loss": 0.73954016, + "learning_rate": 3.99709640513106e-06, + "loss": 0.76365793, + "num_input_tokens_seen": 5179510, + "step": 249, + "time_per_iteration": 2.7981364727020264 + }, + { + "auxiliary_loss_clip": 0.01299178, + "auxiliary_loss_mlp": 0.01113971, + "balance_loss_clip": 1.09869766, + "balance_loss_mlp": 1.02098775, + "epoch": 0.03006072265977274, + "flos": 25625028447360.0, + "grad_norm": 2.114414744927398, + "language_loss": 0.85559851, + "learning_rate": 4e-06, + "loss": 0.87972999, + "num_input_tokens_seen": 5199345, + "step": 250, + "time_per_iteration": 2.8095545768737793 + }, + { + "auxiliary_loss_clip": 0.01299431, + "auxiliary_loss_mlp": 0.01116801, + "balance_loss_clip": 1.09879506, + "balance_loss_mlp": 1.02500963, + "epoch": 0.03018096555041183, + "flos": 22127078292480.0, + "grad_norm": 2.9437992937800543, + "language_loss": 0.88578916, + "learning_rate": 3.999999848300794e-06, + "loss": 0.90995145, + "num_input_tokens_seen": 5218330, + "step": 251, + "time_per_iteration": 2.739192485809326 + }, + { + "auxiliary_loss_clip": 0.01294831, + "auxiliary_loss_mlp": 0.01110607, + "balance_loss_clip": 1.09480429, + "balance_loss_mlp": 1.01905441, + "epoch": 0.030301208441050925, + "flos": 30188197359360.0, + "grad_norm": 1.597425738134044, + "language_loss": 0.89293683, + "learning_rate": 3.999999393203203e-06, + "loss": 0.91699117, + "num_input_tokens_seen": 5240740, + "step": 252, + "time_per_iteration": 2.8481545448303223 + }, + { + "auxiliary_loss_clip": 0.01295762, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_clip": 1.09621263, + "balance_loss_mlp": 1.01998985, + "epoch": 0.030421451331690014, + "flos": 23621392920960.0, + "grad_norm": 1.9595919444344054, + "language_loss": 0.85006607, + "learning_rate": 3.999998634707293e-06, + "loss": 0.87414205, + "num_input_tokens_seen": 5260290, + "step": 253, + "time_per_iteration": 2.708956241607666 + }, + { + "auxiliary_loss_clip": 0.01301582, + "auxiliary_loss_mlp": 0.01117163, + "balance_loss_clip": 1.10193753, + "balance_loss_mlp": 1.02537203, + "epoch": 0.030541694222329104, + "flos": 27928446883200.0, + "grad_norm": 2.7281504624060684, + "language_loss": 0.96578872, + "learning_rate": 3.999997572813182e-06, + "loss": 0.98997617, + "num_input_tokens_seen": 5278100, + "step": 254, + "time_per_iteration": 2.741961717605591 + }, + { + "auxiliary_loss_clip": 0.01296738, + "auxiliary_loss_mlp": 0.01110219, + "balance_loss_clip": 1.09764647, + "balance_loss_mlp": 1.01842797, + "epoch": 0.030661937112968194, + "flos": 18588441006720.0, + "grad_norm": 1.908831333482971, + "language_loss": 0.87828791, + "learning_rate": 3.999996207521028e-06, + "loss": 0.90235746, + "num_input_tokens_seen": 5296810, + "step": 255, + "time_per_iteration": 2.721982479095459 + }, + { + "auxiliary_loss_clip": 0.01297173, + "auxiliary_loss_mlp": 0.01110425, + "balance_loss_clip": 1.09783506, + "balance_loss_mlp": 1.0186336, + "epoch": 0.030782180003607287, + "flos": 12969139478400.0, + "grad_norm": 2.6461093756465, + "language_loss": 0.82327777, + "learning_rate": 3.999994538831039e-06, + "loss": 0.84735376, + "num_input_tokens_seen": 5313395, + "step": 256, + "time_per_iteration": 2.6670875549316406 + }, + { + "auxiliary_loss_clip": 0.01294465, + "auxiliary_loss_mlp": 0.01106828, + "balance_loss_clip": 1.0960331, + "balance_loss_mlp": 1.01508427, + "epoch": 0.030902422894246377, + "flos": 23335364920320.0, + "grad_norm": 2.3506664856162374, + "language_loss": 0.85792911, + "learning_rate": 3.99999256674347e-06, + "loss": 0.88194203, + "num_input_tokens_seen": 5333545, + "step": 257, + "time_per_iteration": 2.794759511947632 + }, + { + "auxiliary_loss_clip": 0.01328645, + "auxiliary_loss_mlp": 0.010837, + "balance_loss_clip": 1.13293672, + "balance_loss_mlp": 1.00015819, + "epoch": 0.031022665784885467, + "flos": 55094151438720.0, + "grad_norm": 1.0147157414853287, + "language_loss": 0.53501052, + "learning_rate": 3.999990291258618e-06, + "loss": 0.55913401, + "num_input_tokens_seen": 5392235, + "step": 258, + "time_per_iteration": 3.2128281593322754 + }, + { + "auxiliary_loss_clip": 0.01297875, + "auxiliary_loss_mlp": 0.01114374, + "balance_loss_clip": 1.0999279, + "balance_loss_mlp": 1.02258265, + "epoch": 0.03114290867552456, + "flos": 19317786664320.0, + "grad_norm": 2.1436632269278437, + "language_loss": 0.86581767, + "learning_rate": 3.999987712376829e-06, + "loss": 0.88994008, + "num_input_tokens_seen": 5410555, + "step": 259, + "time_per_iteration": 2.7150096893310547 + }, + { + "auxiliary_loss_clip": 0.01297767, + "auxiliary_loss_mlp": 0.01112718, + "balance_loss_clip": 1.09994411, + "balance_loss_mlp": 1.02107024, + "epoch": 0.031263151566163654, + "flos": 20959442881920.0, + "grad_norm": 2.1385525021162644, + "language_loss": 0.82271928, + "learning_rate": 3.999984830098494e-06, + "loss": 0.84682411, + "num_input_tokens_seen": 5430135, + "step": 260, + "time_per_iteration": 2.7545993328094482 + }, + { + "auxiliary_loss_clip": 0.01294783, + "auxiliary_loss_mlp": 0.01110077, + "balance_loss_clip": 1.09709096, + "balance_loss_mlp": 1.01828623, + "epoch": 0.03138339445680274, + "flos": 14793006412800.0, + "grad_norm": 3.106179126108485, + "language_loss": 0.98063242, + "learning_rate": 3.999981644424051e-06, + "loss": 1.00468111, + "num_input_tokens_seen": 5444935, + "step": 261, + "time_per_iteration": 2.7613000869750977 + }, + { + "auxiliary_loss_clip": 0.01295652, + "auxiliary_loss_mlp": 0.01112455, + "balance_loss_clip": 1.0979383, + "balance_loss_mlp": 1.0209496, + "epoch": 0.03150363734744183, + "flos": 11655599022720.0, + "grad_norm": 2.6600575704773624, + "language_loss": 0.86142963, + "learning_rate": 3.999978155353982e-06, + "loss": 0.88551068, + "num_input_tokens_seen": 5462080, + "step": 262, + "time_per_iteration": 2.62861967086792 + }, + { + "auxiliary_loss_clip": 0.01292147, + "auxiliary_loss_mlp": 0.01107804, + "balance_loss_clip": 1.09464741, + "balance_loss_mlp": 1.01625085, + "epoch": 0.03162388023808092, + "flos": 33727732485120.0, + "grad_norm": 4.72987163497874, + "language_loss": 0.80243993, + "learning_rate": 3.9999743628888186e-06, + "loss": 0.8264395, + "num_input_tokens_seen": 5483870, + "step": 263, + "time_per_iteration": 2.814974546432495 + }, + { + "auxiliary_loss_clip": 0.01292953, + "auxiliary_loss_mlp": 0.01108321, + "balance_loss_clip": 1.09615183, + "balance_loss_mlp": 1.01705456, + "epoch": 0.03174412312872001, + "flos": 20810952057600.0, + "grad_norm": 2.584565544243143, + "language_loss": 0.89681131, + "learning_rate": 3.999970267029133e-06, + "loss": 0.92082405, + "num_input_tokens_seen": 5502830, + "step": 264, + "time_per_iteration": 2.740891695022583 + }, + { + "auxiliary_loss_clip": 0.01293985, + "auxiliary_loss_mlp": 0.01108305, + "balance_loss_clip": 1.09711242, + "balance_loss_mlp": 1.0169909, + "epoch": 0.0318643660193591, + "flos": 23727939638400.0, + "grad_norm": 2.004247360534677, + "language_loss": 0.80033529, + "learning_rate": 3.999965867775548e-06, + "loss": 0.82435822, + "num_input_tokens_seen": 5523225, + "step": 265, + "time_per_iteration": 2.737525701522827 + }, + { + "auxiliary_loss_clip": 0.0129113, + "auxiliary_loss_mlp": 0.0110876, + "balance_loss_clip": 1.0936861, + "balance_loss_mlp": 1.01682615, + "epoch": 0.0319846089099982, + "flos": 13917863450880.0, + "grad_norm": 2.7899322975026566, + "language_loss": 0.86992645, + "learning_rate": 3.9999611651287315e-06, + "loss": 0.89392531, + "num_input_tokens_seen": 5541380, + "step": 266, + "time_per_iteration": 2.705693006515503 + }, + { + "auxiliary_loss_clip": 0.01290942, + "auxiliary_loss_mlp": 0.01114981, + "balance_loss_clip": 1.09472167, + "balance_loss_mlp": 1.02252245, + "epoch": 0.03210485180063729, + "flos": 14753253035520.0, + "grad_norm": 2.662146121571784, + "language_loss": 0.78456962, + "learning_rate": 3.999956159089396e-06, + "loss": 0.80862892, + "num_input_tokens_seen": 5558830, + "step": 267, + "time_per_iteration": 2.6728274822235107 + }, + { + "auxiliary_loss_clip": 0.01295173, + "auxiliary_loss_mlp": 0.0110807, + "balance_loss_clip": 1.0987606, + "balance_loss_mlp": 1.01694608, + "epoch": 0.03222509469127638, + "flos": 28913153304960.0, + "grad_norm": 2.7012082538798454, + "language_loss": 0.79406452, + "learning_rate": 3.999950849658302e-06, + "loss": 0.81809694, + "num_input_tokens_seen": 5577750, + "step": 268, + "time_per_iteration": 2.7807610034942627 + }, + { + "auxiliary_loss_clip": 0.01289371, + "auxiliary_loss_mlp": 0.01112264, + "balance_loss_clip": 1.09366822, + "balance_loss_mlp": 1.0211401, + "epoch": 0.03234533758191547, + "flos": 16946389739520.0, + "grad_norm": 2.8212885866554656, + "language_loss": 0.83881211, + "learning_rate": 3.999945236836254e-06, + "loss": 0.86282849, + "num_input_tokens_seen": 5596715, + "step": 269, + "time_per_iteration": 2.720799684524536 + }, + { + "auxiliary_loss_clip": 0.01292558, + "auxiliary_loss_mlp": 0.01109763, + "balance_loss_clip": 1.09698939, + "balance_loss_mlp": 1.01816225, + "epoch": 0.03246558047255456, + "flos": 18989096284800.0, + "grad_norm": 3.0603232006648886, + "language_loss": 0.94746405, + "learning_rate": 3.999939320624103e-06, + "loss": 0.97148716, + "num_input_tokens_seen": 5611865, + "step": 270, + "time_per_iteration": 2.7671520709991455 + }, + { + "auxiliary_loss_clip": 0.01292096, + "auxiliary_loss_mlp": 0.01113935, + "balance_loss_clip": 1.09661007, + "balance_loss_mlp": 1.02185822, + "epoch": 0.03258582336319365, + "flos": 23728334688000.0, + "grad_norm": 2.2030563096204627, + "language_loss": 0.89961112, + "learning_rate": 3.999933101022749e-06, + "loss": 0.92367148, + "num_input_tokens_seen": 5632270, + "step": 271, + "time_per_iteration": 5.544266700744629 + }, + { + "auxiliary_loss_clip": 0.01290579, + "auxiliary_loss_mlp": 0.01110525, + "balance_loss_clip": 1.09562945, + "balance_loss_mlp": 1.01930571, + "epoch": 0.032706066253832745, + "flos": 27670823562240.0, + "grad_norm": 1.8684245703535434, + "language_loss": 0.86664319, + "learning_rate": 3.999926578033132e-06, + "loss": 0.89065427, + "num_input_tokens_seen": 5652085, + "step": 272, + "time_per_iteration": 2.7684710025787354 + }, + { + "auxiliary_loss_clip": 0.01288228, + "auxiliary_loss_mlp": 0.01109466, + "balance_loss_clip": 1.09296203, + "balance_loss_mlp": 1.01767504, + "epoch": 0.032826309144471835, + "flos": 45624685968000.0, + "grad_norm": 1.7858775170928554, + "language_loss": 0.62923455, + "learning_rate": 3.999919751656244e-06, + "loss": 0.65321147, + "num_input_tokens_seen": 5678985, + "step": 273, + "time_per_iteration": 2.954774856567383 + }, + { + "auxiliary_loss_clip": 0.01289875, + "auxiliary_loss_mlp": 0.01111436, + "balance_loss_clip": 1.09458995, + "balance_loss_mlp": 1.02040768, + "epoch": 0.032946552035110925, + "flos": 25812374808960.0, + "grad_norm": 2.3670443740854634, + "language_loss": 0.75725877, + "learning_rate": 3.9999126218931195e-06, + "loss": 0.78127187, + "num_input_tokens_seen": 5697020, + "step": 274, + "time_per_iteration": 2.8238468170166016 + }, + { + "auxiliary_loss_clip": 0.01291797, + "auxiliary_loss_mlp": 0.01114715, + "balance_loss_clip": 1.09703088, + "balance_loss_mlp": 1.02321029, + "epoch": 0.033066794925750015, + "flos": 15121984101120.0, + "grad_norm": 2.372115500237314, + "language_loss": 0.89469367, + "learning_rate": 3.99990518874484e-06, + "loss": 0.91875881, + "num_input_tokens_seen": 5713460, + "step": 275, + "time_per_iteration": 2.693390130996704 + }, + { + "auxiliary_loss_clip": 0.01292478, + "auxiliary_loss_mlp": 0.01109337, + "balance_loss_clip": 1.09823394, + "balance_loss_mlp": 1.01826072, + "epoch": 0.033187037816389105, + "flos": 22776593973120.0, + "grad_norm": 2.474185738960017, + "language_loss": 0.92754287, + "learning_rate": 3.999897452212534e-06, + "loss": 0.95156097, + "num_input_tokens_seen": 5730790, + "step": 276, + "time_per_iteration": 2.7763023376464844 + }, + { + "auxiliary_loss_clip": 0.01287713, + "auxiliary_loss_mlp": 0.01108757, + "balance_loss_clip": 1.09364462, + "balance_loss_mlp": 1.01753759, + "epoch": 0.033307280707028195, + "flos": 23331414424320.0, + "grad_norm": 2.4368031885042503, + "language_loss": 0.99986577, + "learning_rate": 3.999889412297374e-06, + "loss": 1.02383041, + "num_input_tokens_seen": 5750215, + "step": 277, + "time_per_iteration": 2.7917308807373047 + }, + { + "auxiliary_loss_clip": 0.0128824, + "auxiliary_loss_mlp": 0.01111769, + "balance_loss_clip": 1.09383476, + "balance_loss_mlp": 1.02035952, + "epoch": 0.03342752359766729, + "flos": 28840290566400.0, + "grad_norm": 2.1896439721597254, + "language_loss": 0.79071772, + "learning_rate": 3.999881069000581e-06, + "loss": 0.81471777, + "num_input_tokens_seen": 5769945, + "step": 278, + "time_per_iteration": 2.7926204204559326 + }, + { + "auxiliary_loss_clip": 0.01287276, + "auxiliary_loss_mlp": 0.01107443, + "balance_loss_clip": 1.09344614, + "balance_loss_mlp": 1.01612878, + "epoch": 0.03354776648830638, + "flos": 19384544090880.0, + "grad_norm": 2.563025766253221, + "language_loss": 0.86840975, + "learning_rate": 3.99987242232342e-06, + "loss": 0.89235699, + "num_input_tokens_seen": 5784950, + "step": 279, + "time_per_iteration": 2.677016496658325 + }, + { + "auxiliary_loss_clip": 0.01287868, + "auxiliary_loss_mlp": 0.01114757, + "balance_loss_clip": 1.09404027, + "balance_loss_mlp": 1.02439654, + "epoch": 0.03366800937894547, + "flos": 17858628472320.0, + "grad_norm": 1.9409507706338502, + "language_loss": 0.79682565, + "learning_rate": 3.9998634722672026e-06, + "loss": 0.82085192, + "num_input_tokens_seen": 5805005, + "step": 280, + "time_per_iteration": 2.7410027980804443 + }, + { + "auxiliary_loss_clip": 0.01289719, + "auxiliary_loss_mlp": 0.01107489, + "balance_loss_clip": 1.09599221, + "balance_loss_mlp": 1.01646042, + "epoch": 0.03378825226958456, + "flos": 35951033635200.0, + "grad_norm": 1.9113352419589114, + "language_loss": 0.78700471, + "learning_rate": 3.999854218833286e-06, + "loss": 0.81097674, + "num_input_tokens_seen": 5825825, + "step": 281, + "time_per_iteration": 2.8881075382232666 + }, + { + "auxiliary_loss_clip": 0.01288372, + "auxiliary_loss_mlp": 0.01112417, + "balance_loss_clip": 1.09492517, + "balance_loss_mlp": 1.02172256, + "epoch": 0.03390849516022365, + "flos": 25702488126720.0, + "grad_norm": 1.9651474508350604, + "language_loss": 0.82266545, + "learning_rate": 3.999844662023075e-06, + "loss": 0.84667331, + "num_input_tokens_seen": 5845700, + "step": 282, + "time_per_iteration": 2.893193483352661 + }, + { + "auxiliary_loss_clip": 0.01287015, + "auxiliary_loss_mlp": 0.0111084, + "balance_loss_clip": 1.09357929, + "balance_loss_mlp": 1.02081275, + "epoch": 0.03402873805086274, + "flos": 21284505987840.0, + "grad_norm": 1.8329924584919837, + "language_loss": 0.92164767, + "learning_rate": 3.999834801838018e-06, + "loss": 0.9456262, + "num_input_tokens_seen": 5864680, + "step": 283, + "time_per_iteration": 2.7643520832061768 + }, + { + "auxiliary_loss_clip": 0.01286824, + "auxiliary_loss_mlp": 0.01107878, + "balance_loss_clip": 1.09389782, + "balance_loss_mlp": 1.01732683, + "epoch": 0.03414898094150183, + "flos": 22710913954560.0, + "grad_norm": 1.8168600140847142, + "language_loss": 0.73981196, + "learning_rate": 3.9998246382796115e-06, + "loss": 0.76375896, + "num_input_tokens_seen": 5884260, + "step": 284, + "time_per_iteration": 2.6731679439544678 + }, + { + "auxiliary_loss_clip": 0.01285655, + "auxiliary_loss_mlp": 0.01104304, + "balance_loss_clip": 1.09241724, + "balance_loss_mlp": 1.01351428, + "epoch": 0.03426922383214093, + "flos": 18879927874560.0, + "grad_norm": 2.1683166531077873, + "language_loss": 0.90799296, + "learning_rate": 3.999814171349399e-06, + "loss": 0.93189263, + "num_input_tokens_seen": 5902120, + "step": 285, + "time_per_iteration": 2.7339603900909424 + }, + { + "auxiliary_loss_clip": 0.01285435, + "auxiliary_loss_mlp": 0.01108504, + "balance_loss_clip": 1.0930661, + "balance_loss_mlp": 1.01823914, + "epoch": 0.03438946672278002, + "flos": 34752012716160.0, + "grad_norm": 1.5750926456542682, + "language_loss": 0.73492599, + "learning_rate": 3.9998034010489655e-06, + "loss": 0.75886536, + "num_input_tokens_seen": 5925810, + "step": 286, + "time_per_iteration": 2.8075811862945557 + }, + { + "auxiliary_loss_clip": 0.01286368, + "auxiliary_loss_mlp": 0.0110749, + "balance_loss_clip": 1.09437871, + "balance_loss_mlp": 1.01770186, + "epoch": 0.03450970961341911, + "flos": 22164102236160.0, + "grad_norm": 2.307830678516599, + "language_loss": 0.76136422, + "learning_rate": 3.999792327379946e-06, + "loss": 0.78530276, + "num_input_tokens_seen": 5945185, + "step": 287, + "time_per_iteration": 2.7875895500183105 + }, + { + "auxiliary_loss_clip": 0.01287403, + "auxiliary_loss_mlp": 0.01104354, + "balance_loss_clip": 1.09567714, + "balance_loss_mlp": 1.01466072, + "epoch": 0.034629952504058197, + "flos": 21725740656000.0, + "grad_norm": 2.1991598544344475, + "language_loss": 0.96170253, + "learning_rate": 3.999780950344021e-06, + "loss": 0.98562008, + "num_input_tokens_seen": 5963375, + "step": 288, + "time_per_iteration": 2.689016819000244 + }, + { + "auxiliary_loss_clip": 0.01286764, + "auxiliary_loss_mlp": 0.01110178, + "balance_loss_clip": 1.09532249, + "balance_loss_mlp": 1.01948345, + "epoch": 0.034750195394697286, + "flos": 20047994248320.0, + "grad_norm": 1.9602976646533077, + "language_loss": 0.82581413, + "learning_rate": 3.999769269942916e-06, + "loss": 0.8497836, + "num_input_tokens_seen": 5983415, + "step": 289, + "time_per_iteration": 2.720937728881836 + }, + { + "auxiliary_loss_clip": 0.01285318, + "auxiliary_loss_mlp": 0.01106198, + "balance_loss_clip": 1.09346294, + "balance_loss_mlp": 1.01617074, + "epoch": 0.034870438285336376, + "flos": 27965865876480.0, + "grad_norm": 2.049044425127534, + "language_loss": 0.81111908, + "learning_rate": 3.999757286178402e-06, + "loss": 0.83503425, + "num_input_tokens_seen": 6005850, + "step": 290, + "time_per_iteration": 2.7250688076019287 + }, + { + "auxiliary_loss_clip": 0.0128443, + "auxiliary_loss_mlp": 0.01109321, + "balance_loss_clip": 1.09285498, + "balance_loss_mlp": 1.01867437, + "epoch": 0.03499068117597547, + "flos": 22017514832640.0, + "grad_norm": 1.7176369426138873, + "language_loss": 0.90709728, + "learning_rate": 3.999744999052299e-06, + "loss": 0.9310348, + "num_input_tokens_seen": 6027240, + "step": 291, + "time_per_iteration": 2.8195641040802 + }, + { + "auxiliary_loss_clip": 0.01314902, + "auxiliary_loss_mlp": 0.01082533, + "balance_loss_clip": 1.12668884, + "balance_loss_mlp": 0.99975455, + "epoch": 0.03511092406661456, + "flos": 57242147725440.0, + "grad_norm": 0.9589189208293083, + "language_loss": 0.61192739, + "learning_rate": 3.9997324085664675e-06, + "loss": 0.63590169, + "num_input_tokens_seen": 6087470, + "step": 292, + "time_per_iteration": 3.2257821559906006 + }, + { + "auxiliary_loss_clip": 0.01283202, + "auxiliary_loss_mlp": 0.0111105, + "balance_loss_clip": 1.09164202, + "balance_loss_mlp": 1.02064157, + "epoch": 0.03523116695725365, + "flos": 22928065626240.0, + "grad_norm": 2.2157849509242085, + "language_loss": 0.92057586, + "learning_rate": 3.999719514722821e-06, + "loss": 0.94451839, + "num_input_tokens_seen": 6107600, + "step": 293, + "time_per_iteration": 2.7253193855285645 + }, + { + "auxiliary_loss_clip": 0.01284106, + "auxiliary_loss_mlp": 0.01104962, + "balance_loss_clip": 1.09342837, + "balance_loss_mlp": 1.01517344, + "epoch": 0.03535140984789274, + "flos": 36903241226880.0, + "grad_norm": 2.2590730959875374, + "language_loss": 0.74920583, + "learning_rate": 3.999706317523314e-06, + "loss": 0.77309656, + "num_input_tokens_seen": 6126160, + "step": 294, + "time_per_iteration": 2.8927228450775146 + }, + { + "auxiliary_loss_clip": 0.01282784, + "auxiliary_loss_mlp": 0.01102894, + "balance_loss_clip": 1.09186602, + "balance_loss_mlp": 1.01353478, + "epoch": 0.03547165273853183, + "flos": 20449152316800.0, + "grad_norm": 2.138494809820146, + "language_loss": 0.86308885, + "learning_rate": 3.999692816969948e-06, + "loss": 0.88694561, + "num_input_tokens_seen": 6145695, + "step": 295, + "time_per_iteration": 2.695070266723633 + }, + { + "auxiliary_loss_clip": 0.01310882, + "auxiliary_loss_mlp": 0.01083287, + "balance_loss_clip": 1.12394762, + "balance_loss_mlp": 1.00050855, + "epoch": 0.03559189562917092, + "flos": 69850564871040.0, + "grad_norm": 0.9910114217071582, + "language_loss": 0.69437301, + "learning_rate": 3.999679013064772e-06, + "loss": 0.71831465, + "num_input_tokens_seen": 6212440, + "step": 296, + "time_per_iteration": 5.280071020126343 + }, + { + "auxiliary_loss_clip": 0.01280739, + "auxiliary_loss_mlp": 0.01106657, + "balance_loss_clip": 1.09045553, + "balance_loss_mlp": 1.01677346, + "epoch": 0.03571213851981002, + "flos": 21651944163840.0, + "grad_norm": 2.5983766333439244, + "language_loss": 0.85770297, + "learning_rate": 3.99966490580988e-06, + "loss": 0.88157696, + "num_input_tokens_seen": 6229800, + "step": 297, + "time_per_iteration": 4.537451267242432 + }, + { + "auxiliary_loss_clip": 0.0128389, + "auxiliary_loss_mlp": 0.01108618, + "balance_loss_clip": 1.0938828, + "balance_loss_mlp": 1.01840007, + "epoch": 0.03583238141044911, + "flos": 43945610757120.0, + "grad_norm": 3.040071414894239, + "language_loss": 0.65889716, + "learning_rate": 3.999650495207411e-06, + "loss": 0.68282223, + "num_input_tokens_seen": 6255825, + "step": 298, + "time_per_iteration": 2.9806621074676514 + }, + { + "auxiliary_loss_clip": 0.012836, + "auxiliary_loss_mlp": 0.01110131, + "balance_loss_clip": 1.09452438, + "balance_loss_mlp": 1.02010417, + "epoch": 0.0359526243010882, + "flos": 18910810592640.0, + "grad_norm": 2.736952447877761, + "language_loss": 0.90652549, + "learning_rate": 3.999635781259553e-06, + "loss": 0.93046284, + "num_input_tokens_seen": 6271090, + "step": 299, + "time_per_iteration": 2.7354965209960938 + }, + { + "auxiliary_loss_clip": 0.01304235, + "auxiliary_loss_mlp": 0.01083732, + "balance_loss_clip": 1.11862195, + "balance_loss_mlp": 1.00095296, + "epoch": 0.03607286719172729, + "flos": 61668892782720.0, + "grad_norm": 0.9152002357248148, + "language_loss": 0.5223521, + "learning_rate": 3.999620763968535e-06, + "loss": 0.54623175, + "num_input_tokens_seen": 6329965, + "step": 300, + "time_per_iteration": 3.1473135948181152 + }, + { + "auxiliary_loss_clip": 0.01279823, + "auxiliary_loss_mlp": 0.01105877, + "balance_loss_clip": 1.09112418, + "balance_loss_mlp": 1.01661265, + "epoch": 0.03619311008236638, + "flos": 27819062991360.0, + "grad_norm": 1.7781139895367557, + "language_loss": 0.86435193, + "learning_rate": 3.999605443336638e-06, + "loss": 0.88820887, + "num_input_tokens_seen": 6352095, + "step": 301, + "time_per_iteration": 2.919443130493164 + }, + { + "auxiliary_loss_clip": 0.01277644, + "auxiliary_loss_mlp": 0.0111018, + "balance_loss_clip": 1.08825231, + "balance_loss_mlp": 1.02015281, + "epoch": 0.03631335297300547, + "flos": 13621133197440.0, + "grad_norm": 3.103396931628289, + "language_loss": 0.89215899, + "learning_rate": 3.999589819366185e-06, + "loss": 0.9160372, + "num_input_tokens_seen": 6365885, + "step": 302, + "time_per_iteration": 2.7426886558532715 + }, + { + "auxiliary_loss_clip": 0.01280479, + "auxiliary_loss_mlp": 0.01105733, + "balance_loss_clip": 1.09163237, + "balance_loss_mlp": 1.01637411, + "epoch": 0.036433595863644565, + "flos": 27631788456960.0, + "grad_norm": 2.6003566161365828, + "language_loss": 0.85006142, + "learning_rate": 3.999573892059547e-06, + "loss": 0.87392348, + "num_input_tokens_seen": 6385015, + "step": 303, + "time_per_iteration": 2.896454334259033 + }, + { + "auxiliary_loss_clip": 0.01279512, + "auxiliary_loss_mlp": 0.01109036, + "balance_loss_clip": 1.09127903, + "balance_loss_mlp": 1.01905668, + "epoch": 0.036553838754283655, + "flos": 24572020314240.0, + "grad_norm": 3.057776762537932, + "language_loss": 0.81021261, + "learning_rate": 3.999557661419138e-06, + "loss": 0.8340981, + "num_input_tokens_seen": 6405165, + "step": 304, + "time_per_iteration": 2.7817881107330322 + }, + { + "auxiliary_loss_clip": 0.01280188, + "auxiliary_loss_mlp": 0.01108268, + "balance_loss_clip": 1.09163022, + "balance_loss_mlp": 1.01876521, + "epoch": 0.036674081644922744, + "flos": 23404313076480.0, + "grad_norm": 2.8481739262826142, + "language_loss": 0.81595361, + "learning_rate": 3.9995411274474225e-06, + "loss": 0.83983815, + "num_input_tokens_seen": 6424445, + "step": 305, + "time_per_iteration": 2.7331619262695312 + }, + { + "auxiliary_loss_clip": 0.01278284, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.08985591, + "balance_loss_mlp": 1.0142628, + "epoch": 0.036794324535561834, + "flos": 27489690253440.0, + "grad_norm": 4.28637753285291, + "language_loss": 0.8147366, + "learning_rate": 3.999524290146908e-06, + "loss": 0.83855808, + "num_input_tokens_seen": 6444650, + "step": 306, + "time_per_iteration": 2.8067140579223633 + }, + { + "auxiliary_loss_clip": 0.01278411, + "auxiliary_loss_mlp": 0.0110992, + "balance_loss_clip": 1.09098089, + "balance_loss_mlp": 1.02060819, + "epoch": 0.036914567426200924, + "flos": 19463476227840.0, + "grad_norm": 2.1859242351737818, + "language_loss": 0.92771745, + "learning_rate": 3.9995071495201485e-06, + "loss": 0.95160073, + "num_input_tokens_seen": 6461755, + "step": 307, + "time_per_iteration": 2.8461759090423584 + }, + { + "auxiliary_loss_clip": 0.01277655, + "auxiliary_loss_mlp": 0.01105062, + "balance_loss_clip": 1.08998585, + "balance_loss_mlp": 1.0158453, + "epoch": 0.037034810316840014, + "flos": 22309324922880.0, + "grad_norm": 2.4684356289981864, + "language_loss": 0.97966957, + "learning_rate": 3.999489705569744e-06, + "loss": 1.00349677, + "num_input_tokens_seen": 6479455, + "step": 308, + "time_per_iteration": 2.785972833633423 + }, + { + "auxiliary_loss_clip": 0.01276214, + "auxiliary_loss_mlp": 0.01104703, + "balance_loss_clip": 1.08868778, + "balance_loss_mlp": 1.0152483, + "epoch": 0.03715505320747911, + "flos": 18588333265920.0, + "grad_norm": 3.248205304244984, + "language_loss": 0.86232376, + "learning_rate": 3.999471958298341e-06, + "loss": 0.88613296, + "num_input_tokens_seen": 6498365, + "step": 309, + "time_per_iteration": 2.809359073638916 + }, + { + "auxiliary_loss_clip": 0.01276741, + "auxiliary_loss_mlp": 0.01107789, + "balance_loss_clip": 1.0895015, + "balance_loss_mlp": 1.01842952, + "epoch": 0.0372752960981182, + "flos": 35955343267200.0, + "grad_norm": 1.9145861282425496, + "language_loss": 0.76300263, + "learning_rate": 3.999453907708631e-06, + "loss": 0.78684795, + "num_input_tokens_seen": 6520770, + "step": 310, + "time_per_iteration": 2.9851202964782715 + }, + { + "auxiliary_loss_clip": 0.01275464, + "auxiliary_loss_mlp": 0.01106797, + "balance_loss_clip": 1.08798826, + "balance_loss_mlp": 1.01748574, + "epoch": 0.03739553898875729, + "flos": 20814040627200.0, + "grad_norm": 1.7159181878661511, + "language_loss": 0.81249845, + "learning_rate": 3.999435553803353e-06, + "loss": 0.836321, + "num_input_tokens_seen": 6540170, + "step": 311, + "time_per_iteration": 2.7633180618286133 + }, + { + "auxiliary_loss_clip": 0.01273537, + "auxiliary_loss_mlp": 0.01104698, + "balance_loss_clip": 1.08625805, + "balance_loss_mlp": 1.01452804, + "epoch": 0.03751578187939638, + "flos": 20264140339200.0, + "grad_norm": 2.899747144045497, + "language_loss": 0.83472061, + "learning_rate": 3.999416896585292e-06, + "loss": 0.85850292, + "num_input_tokens_seen": 6557200, + "step": 312, + "time_per_iteration": 2.85109543800354 + }, + { + "auxiliary_loss_clip": 0.01275044, + "auxiliary_loss_mlp": 0.0110545, + "balance_loss_clip": 1.08813524, + "balance_loss_mlp": 1.01570904, + "epoch": 0.03763602477003547, + "flos": 20668063754880.0, + "grad_norm": 3.016380380309105, + "language_loss": 0.85420585, + "learning_rate": 3.9993979360572775e-06, + "loss": 0.87801075, + "num_input_tokens_seen": 6577340, + "step": 313, + "time_per_iteration": 2.8209311962127686 + }, + { + "auxiliary_loss_clip": 0.01276768, + "auxiliary_loss_mlp": 0.01105562, + "balance_loss_clip": 1.0897181, + "balance_loss_mlp": 1.01605964, + "epoch": 0.03775626766067456, + "flos": 16691352197760.0, + "grad_norm": 3.2953441374563526, + "language_loss": 0.82748473, + "learning_rate": 3.999378672222185e-06, + "loss": 0.85130805, + "num_input_tokens_seen": 6595125, + "step": 314, + "time_per_iteration": 2.8336591720581055 + }, + { + "auxiliary_loss_clip": 0.01275315, + "auxiliary_loss_mlp": 0.01103925, + "balance_loss_clip": 1.08904362, + "balance_loss_mlp": 1.0147084, + "epoch": 0.03787651055131366, + "flos": 21141797253120.0, + "grad_norm": 2.3037646921018897, + "language_loss": 0.82693428, + "learning_rate": 3.9993591050829385e-06, + "loss": 0.85072666, + "num_input_tokens_seen": 6612990, + "step": 315, + "time_per_iteration": 2.9689719676971436 + }, + { + "auxiliary_loss_clip": 0.01276747, + "auxiliary_loss_mlp": 0.01109017, + "balance_loss_clip": 1.09008455, + "balance_loss_mlp": 1.01908565, + "epoch": 0.037996753441952746, + "flos": 22018089450240.0, + "grad_norm": 1.9765172567777527, + "language_loss": 0.79247987, + "learning_rate": 3.999339234642506e-06, + "loss": 0.81633747, + "num_input_tokens_seen": 6632740, + "step": 316, + "time_per_iteration": 2.78717041015625 + }, + { + "auxiliary_loss_clip": 0.01274415, + "auxiliary_loss_mlp": 0.01105696, + "balance_loss_clip": 1.0883925, + "balance_loss_mlp": 1.01590753, + "epoch": 0.038116996332591836, + "flos": 27709391790720.0, + "grad_norm": 1.945594835412296, + "language_loss": 0.83725375, + "learning_rate": 3.9993190609038994e-06, + "loss": 0.8610549, + "num_input_tokens_seen": 6651505, + "step": 317, + "time_per_iteration": 3.1141703128814697 + }, + { + "auxiliary_loss_clip": 0.01274631, + "auxiliary_loss_mlp": 0.01108757, + "balance_loss_clip": 1.08822834, + "balance_loss_mlp": 1.01968431, + "epoch": 0.038237239223230926, + "flos": 21178067011200.0, + "grad_norm": 2.019616686282309, + "language_loss": 0.82994843, + "learning_rate": 3.999298583870182e-06, + "loss": 0.85378236, + "num_input_tokens_seen": 6671090, + "step": 318, + "time_per_iteration": 2.8416731357574463 + }, + { + "auxiliary_loss_clip": 0.01271773, + "auxiliary_loss_mlp": 0.01107654, + "balance_loss_clip": 1.0862093, + "balance_loss_mlp": 1.01834202, + "epoch": 0.038357482113870016, + "flos": 25556618995200.0, + "grad_norm": 2.5215042910046006, + "language_loss": 0.77405429, + "learning_rate": 3.999277803544458e-06, + "loss": 0.79784858, + "num_input_tokens_seen": 6691245, + "step": 319, + "time_per_iteration": 2.873650074005127 + }, + { + "auxiliary_loss_clip": 0.01287883, + "auxiliary_loss_mlp": 0.01084804, + "balance_loss_clip": 1.10713744, + "balance_loss_mlp": 1.0024066, + "epoch": 0.038477725004509106, + "flos": 59227578034560.0, + "grad_norm": 0.947403295598017, + "language_loss": 0.62446052, + "learning_rate": 3.999256719929882e-06, + "loss": 0.64818746, + "num_input_tokens_seen": 6752520, + "step": 320, + "time_per_iteration": 3.2843034267425537 + }, + { + "auxiliary_loss_clip": 0.01287233, + "auxiliary_loss_mlp": 0.01083799, + "balance_loss_clip": 1.10692453, + "balance_loss_mlp": 1.00140142, + "epoch": 0.0385979678951482, + "flos": 67317676398720.0, + "grad_norm": 1.2304698907274627, + "language_loss": 0.6708129, + "learning_rate": 3.999235333029651e-06, + "loss": 0.69452316, + "num_input_tokens_seen": 6806460, + "step": 321, + "time_per_iteration": 3.21026611328125 + }, + { + "auxiliary_loss_clip": 0.01274709, + "auxiliary_loss_mlp": 0.01106248, + "balance_loss_clip": 1.08902586, + "balance_loss_mlp": 1.0172224, + "epoch": 0.03871821078578729, + "flos": 22746752749440.0, + "grad_norm": 1.8738199314340795, + "language_loss": 0.81974691, + "learning_rate": 3.999213642847009e-06, + "loss": 0.8435564, + "num_input_tokens_seen": 6827045, + "step": 322, + "time_per_iteration": 4.85815167427063 + }, + { + "auxiliary_loss_clip": 0.01271779, + "auxiliary_loss_mlp": 0.01103368, + "balance_loss_clip": 1.08641744, + "balance_loss_mlp": 1.01400876, + "epoch": 0.03883845367642638, + "flos": 26280613526400.0, + "grad_norm": 1.733528660052852, + "language_loss": 0.91051668, + "learning_rate": 3.999191649385247e-06, + "loss": 0.93426812, + "num_input_tokens_seen": 6848220, + "step": 323, + "time_per_iteration": 4.619458436965942 + }, + { + "auxiliary_loss_clip": 0.01283146, + "auxiliary_loss_mlp": 0.01083469, + "balance_loss_clip": 1.10377789, + "balance_loss_mlp": 1.00107121, + "epoch": 0.03895869656706547, + "flos": 56962835568000.0, + "grad_norm": 0.9087969509720386, + "language_loss": 0.59769821, + "learning_rate": 3.999169352647702e-06, + "loss": 0.62136436, + "num_input_tokens_seen": 6909400, + "step": 324, + "time_per_iteration": 3.2363157272338867 + }, + { + "auxiliary_loss_clip": 0.01271343, + "auxiliary_loss_mlp": 0.01103804, + "balance_loss_clip": 1.08637106, + "balance_loss_mlp": 1.01363373, + "epoch": 0.03907893945770456, + "flos": 24863363527680.0, + "grad_norm": 2.8667382001411554, + "language_loss": 0.83011347, + "learning_rate": 3.999146752637755e-06, + "loss": 0.85386497, + "num_input_tokens_seen": 6930445, + "step": 325, + "time_per_iteration": 2.7554426193237305 + }, + { + "auxiliary_loss_clip": 0.01270262, + "auxiliary_loss_mlp": 0.01106127, + "balance_loss_clip": 1.08558905, + "balance_loss_mlp": 1.01657677, + "epoch": 0.03919918234834365, + "flos": 18368595815040.0, + "grad_norm": 29.210888137778586, + "language_loss": 0.89333141, + "learning_rate": 3.999123849358836e-06, + "loss": 0.91709536, + "num_input_tokens_seen": 6948110, + "step": 326, + "time_per_iteration": 2.797146797180176 + }, + { + "auxiliary_loss_clip": 0.01270148, + "auxiliary_loss_mlp": 0.01102519, + "balance_loss_clip": 1.08566439, + "balance_loss_mlp": 1.01363635, + "epoch": 0.03931942523898275, + "flos": 25225414663680.0, + "grad_norm": 1.8539705371002526, + "language_loss": 0.74548018, + "learning_rate": 3.999100642814418e-06, + "loss": 0.76920688, + "num_input_tokens_seen": 6968550, + "step": 327, + "time_per_iteration": 2.7045772075653076 + }, + { + "auxiliary_loss_clip": 0.01270353, + "auxiliary_loss_mlp": 0.01108717, + "balance_loss_clip": 1.08622491, + "balance_loss_mlp": 1.01907146, + "epoch": 0.03943966812962184, + "flos": 23257905240960.0, + "grad_norm": 2.3031848536971617, + "language_loss": 0.88612235, + "learning_rate": 3.999077133008022e-06, + "loss": 0.90991306, + "num_input_tokens_seen": 6987135, + "step": 328, + "time_per_iteration": 2.753023862838745 + }, + { + "auxiliary_loss_clip": 0.01271404, + "auxiliary_loss_mlp": 0.01106653, + "balance_loss_clip": 1.08722222, + "balance_loss_mlp": 1.01700795, + "epoch": 0.03955991102026093, + "flos": 29168837291520.0, + "grad_norm": 1.8886681850077633, + "language_loss": 0.90804791, + "learning_rate": 3.9990533199432145e-06, + "loss": 0.9318285, + "num_input_tokens_seen": 7008630, + "step": 329, + "time_per_iteration": 2.801905632019043 + }, + { + "auxiliary_loss_clip": 0.01270904, + "auxiliary_loss_mlp": 0.01111086, + "balance_loss_clip": 1.08698428, + "balance_loss_mlp": 1.02158332, + "epoch": 0.03968015391090002, + "flos": 17602441695360.0, + "grad_norm": 2.472420506499299, + "language_loss": 0.75223064, + "learning_rate": 3.999029203623608e-06, + "loss": 0.77605057, + "num_input_tokens_seen": 7026350, + "step": 330, + "time_per_iteration": 2.758068799972534 + }, + { + "auxiliary_loss_clip": 0.01269047, + "auxiliary_loss_mlp": 0.01106518, + "balance_loss_clip": 1.08574605, + "balance_loss_mlp": 1.01768327, + "epoch": 0.03980039680153911, + "flos": 21799285752960.0, + "grad_norm": 2.0054160194902497, + "language_loss": 0.86768067, + "learning_rate": 3.99900478405286e-06, + "loss": 0.89143628, + "num_input_tokens_seen": 7045660, + "step": 331, + "time_per_iteration": 2.765371799468994 + }, + { + "auxiliary_loss_clip": 0.0127064, + "auxiliary_loss_mlp": 0.01105008, + "balance_loss_clip": 1.08700538, + "balance_loss_mlp": 1.01607752, + "epoch": 0.0399206396921782, + "flos": 15195134148480.0, + "grad_norm": 2.2953744405065772, + "language_loss": 0.82381994, + "learning_rate": 3.998980061234676e-06, + "loss": 0.84757644, + "num_input_tokens_seen": 7063575, + "step": 332, + "time_per_iteration": 2.6883575916290283 + }, + { + "auxiliary_loss_clip": 0.01271187, + "auxiliary_loss_mlp": 0.01107235, + "balance_loss_clip": 1.08723509, + "balance_loss_mlp": 1.01801848, + "epoch": 0.040040882582817294, + "flos": 14422910630400.0, + "grad_norm": 2.5662743520568214, + "language_loss": 0.75505245, + "learning_rate": 3.9989550351728055e-06, + "loss": 0.77883667, + "num_input_tokens_seen": 7080505, + "step": 333, + "time_per_iteration": 2.7338707447052 + }, + { + "auxiliary_loss_clip": 0.01266964, + "auxiliary_loss_mlp": 0.01103991, + "balance_loss_clip": 1.08413696, + "balance_loss_mlp": 1.01515663, + "epoch": 0.040161125473456384, + "flos": 19280906375040.0, + "grad_norm": 2.429385700040948, + "language_loss": 0.84383309, + "learning_rate": 3.998929705871046e-06, + "loss": 0.86754268, + "num_input_tokens_seen": 7097860, + "step": 334, + "time_per_iteration": 2.649466037750244 + }, + { + "auxiliary_loss_clip": 0.01268984, + "auxiliary_loss_mlp": 0.01106917, + "balance_loss_clip": 1.08597815, + "balance_loss_mlp": 1.01832032, + "epoch": 0.040281368364095474, + "flos": 17821101738240.0, + "grad_norm": 2.8918764808462374, + "language_loss": 0.88600415, + "learning_rate": 3.99890407333324e-06, + "loss": 0.9097631, + "num_input_tokens_seen": 7116390, + "step": 335, + "time_per_iteration": 2.700169324874878 + }, + { + "auxiliary_loss_clip": 0.01266745, + "auxiliary_loss_mlp": 0.01106637, + "balance_loss_clip": 1.0837276, + "balance_loss_mlp": 1.01737261, + "epoch": 0.040401611254734564, + "flos": 19573757959680.0, + "grad_norm": 2.7684130483236453, + "language_loss": 0.87045634, + "learning_rate": 3.998878137563275e-06, + "loss": 0.89419019, + "num_input_tokens_seen": 7135940, + "step": 336, + "time_per_iteration": 2.7291438579559326 + }, + { + "auxiliary_loss_clip": 0.01267649, + "auxiliary_loss_mlp": 0.01105685, + "balance_loss_clip": 1.08475852, + "balance_loss_mlp": 1.01608706, + "epoch": 0.040521854145373654, + "flos": 22054466949120.0, + "grad_norm": 2.0773657588243695, + "language_loss": 0.8528266, + "learning_rate": 3.998851898565085e-06, + "loss": 0.87655991, + "num_input_tokens_seen": 7155745, + "step": 337, + "time_per_iteration": 2.755781412124634 + }, + { + "auxiliary_loss_clip": 0.01265976, + "auxiliary_loss_mlp": 0.01105768, + "balance_loss_clip": 1.08267808, + "balance_loss_mlp": 1.01636124, + "epoch": 0.04064209703601274, + "flos": 22674644196480.0, + "grad_norm": 3.732141204061747, + "language_loss": 0.83011222, + "learning_rate": 3.998825356342653e-06, + "loss": 0.85382968, + "num_input_tokens_seen": 7175920, + "step": 338, + "time_per_iteration": 2.713850498199463 + }, + { + "auxiliary_loss_clip": 0.01264474, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_clip": 1.08158696, + "balance_loss_mlp": 1.01489222, + "epoch": 0.04076233992665183, + "flos": 38582172783360.0, + "grad_norm": 4.152026133845938, + "language_loss": 0.72842777, + "learning_rate": 3.998798510900003e-06, + "loss": 0.75211114, + "num_input_tokens_seen": 7198720, + "step": 339, + "time_per_iteration": 2.8098983764648438 + }, + { + "auxiliary_loss_clip": 0.01265032, + "auxiliary_loss_mlp": 0.01104404, + "balance_loss_clip": 1.08285427, + "balance_loss_mlp": 1.01537848, + "epoch": 0.04088258281729093, + "flos": 25885309374720.0, + "grad_norm": 2.2359208782241375, + "language_loss": 0.84079373, + "learning_rate": 3.998771362241207e-06, + "loss": 0.86448807, + "num_input_tokens_seen": 7219125, + "step": 340, + "time_per_iteration": 2.756596565246582 + }, + { + "auxiliary_loss_clip": 0.01264788, + "auxiliary_loss_mlp": 0.01100834, + "balance_loss_clip": 1.08268452, + "balance_loss_mlp": 1.01219034, + "epoch": 0.04100282570793002, + "flos": 19789832223360.0, + "grad_norm": 2.360185716519728, + "language_loss": 0.87883568, + "learning_rate": 3.998743910370385e-06, + "loss": 0.90249187, + "num_input_tokens_seen": 7237985, + "step": 341, + "time_per_iteration": 2.721733570098877 + }, + { + "auxiliary_loss_clip": 0.0126973, + "auxiliary_loss_mlp": 0.01112984, + "balance_loss_clip": 1.08810973, + "balance_loss_mlp": 1.02357721, + "epoch": 0.04112306859856911, + "flos": 22565152563840.0, + "grad_norm": 2.119881688395707, + "language_loss": 0.73252976, + "learning_rate": 3.998716155291702e-06, + "loss": 0.75635695, + "num_input_tokens_seen": 7255825, + "step": 342, + "time_per_iteration": 2.751804828643799 + }, + { + "auxiliary_loss_clip": 0.01268177, + "auxiliary_loss_mlp": 0.01102912, + "balance_loss_clip": 1.08601177, + "balance_loss_mlp": 1.01383889, + "epoch": 0.0412433114892082, + "flos": 25040654081280.0, + "grad_norm": 1.7255901652378722, + "language_loss": 0.90412259, + "learning_rate": 3.998688097009366e-06, + "loss": 0.92783356, + "num_input_tokens_seen": 7276590, + "step": 343, + "time_per_iteration": 2.723381519317627 + }, + { + "auxiliary_loss_clip": 0.01266794, + "auxiliary_loss_mlp": 0.01106307, + "balance_loss_clip": 1.08489585, + "balance_loss_mlp": 1.01742458, + "epoch": 0.04136355437984729, + "flos": 25191371548800.0, + "grad_norm": 3.4507294908771433, + "language_loss": 0.79978794, + "learning_rate": 3.998659735527636e-06, + "loss": 0.82351899, + "num_input_tokens_seen": 7295680, + "step": 344, + "time_per_iteration": 2.7156481742858887 + }, + { + "auxiliary_loss_clip": 0.01264257, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_clip": 1.08333349, + "balance_loss_mlp": 1.01615477, + "epoch": 0.04148379727048638, + "flos": 22966777509120.0, + "grad_norm": 1.7369199720763417, + "language_loss": 0.77599466, + "learning_rate": 3.998631070850813e-06, + "loss": 0.79968381, + "num_input_tokens_seen": 7316300, + "step": 345, + "time_per_iteration": 2.657468557357788 + }, + { + "auxiliary_loss_clip": 0.01264272, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_clip": 1.08326936, + "balance_loss_mlp": 1.01715279, + "epoch": 0.041604040161125476, + "flos": 14063481187200.0, + "grad_norm": 2.612802424106273, + "language_loss": 0.83449793, + "learning_rate": 3.9986021029832455e-06, + "loss": 0.85819435, + "num_input_tokens_seen": 7333615, + "step": 346, + "time_per_iteration": 2.75141978263855 + }, + { + "auxiliary_loss_clip": 0.01262096, + "auxiliary_loss_mlp": 0.01101964, + "balance_loss_clip": 1.08100605, + "balance_loss_mlp": 1.01250982, + "epoch": 0.041724283051764566, + "flos": 12091877614080.0, + "grad_norm": 3.620931605529322, + "language_loss": 0.91593999, + "learning_rate": 3.9985728319293285e-06, + "loss": 0.93958056, + "num_input_tokens_seen": 7347590, + "step": 347, + "time_per_iteration": 3.6329245567321777 + }, + { + "auxiliary_loss_clip": 0.01263065, + "auxiliary_loss_mlp": 0.01107594, + "balance_loss_clip": 1.08130455, + "balance_loss_mlp": 1.01761484, + "epoch": 0.041844525942403656, + "flos": 12385303816320.0, + "grad_norm": 2.1771657268355105, + "language_loss": 0.84858382, + "learning_rate": 3.998543257693501e-06, + "loss": 0.87229037, + "num_input_tokens_seen": 7364345, + "step": 348, + "time_per_iteration": 3.6428890228271484 + }, + { + "auxiliary_loss_clip": 0.01264232, + "auxiliary_loss_mlp": 0.01103238, + "balance_loss_clip": 1.08291769, + "balance_loss_mlp": 1.01497531, + "epoch": 0.041964768833042745, + "flos": 23769345041280.0, + "grad_norm": 2.0339410471952633, + "language_loss": 0.87868023, + "learning_rate": 3.998513380280251e-06, + "loss": 0.90235496, + "num_input_tokens_seen": 7384625, + "step": 349, + "time_per_iteration": 3.769749402999878 + }, + { + "auxiliary_loss_clip": 0.01264467, + "auxiliary_loss_mlp": 0.01110913, + "balance_loss_clip": 1.08350945, + "balance_loss_mlp": 1.02074349, + "epoch": 0.042085011723681835, + "flos": 11875336473600.0, + "grad_norm": 2.3535100204299724, + "language_loss": 0.94922209, + "learning_rate": 3.99848319969411e-06, + "loss": 0.97297585, + "num_input_tokens_seen": 7402225, + "step": 350, + "time_per_iteration": 2.624035358428955 + }, + { + "auxiliary_loss_clip": 0.01264069, + "auxiliary_loss_mlp": 0.01107927, + "balance_loss_clip": 1.0832665, + "balance_loss_mlp": 1.0193305, + "epoch": 0.042205254614320925, + "flos": 16873957964160.0, + "grad_norm": 2.137353355449484, + "language_loss": 0.79064012, + "learning_rate": 3.9984527159396564e-06, + "loss": 0.81436014, + "num_input_tokens_seen": 7420865, + "step": 351, + "time_per_iteration": 2.6149423122406006 + }, + { + "auxiliary_loss_clip": 0.01262808, + "auxiliary_loss_mlp": 0.01101636, + "balance_loss_clip": 1.08207655, + "balance_loss_mlp": 1.01246715, + "epoch": 0.04232549750496002, + "flos": 25118508810240.0, + "grad_norm": 2.107277844861826, + "language_loss": 0.84303975, + "learning_rate": 3.9984219290215154e-06, + "loss": 0.86668426, + "num_input_tokens_seen": 7441040, + "step": 352, + "time_per_iteration": 2.686621904373169 + }, + { + "auxiliary_loss_clip": 0.01261434, + "auxiliary_loss_mlp": 0.01102512, + "balance_loss_clip": 1.08058977, + "balance_loss_mlp": 1.01420212, + "epoch": 0.04244574039559911, + "flos": 26724541714560.0, + "grad_norm": 1.5859340592881308, + "language_loss": 0.89007372, + "learning_rate": 3.998390838944356e-06, + "loss": 0.91371322, + "num_input_tokens_seen": 7462545, + "step": 353, + "time_per_iteration": 2.673867702484131 + }, + { + "auxiliary_loss_clip": 0.01261982, + "auxiliary_loss_mlp": 0.01105566, + "balance_loss_clip": 1.08163106, + "balance_loss_mlp": 1.01735091, + "epoch": 0.0425659832862382, + "flos": 20923244951040.0, + "grad_norm": 2.105549346415382, + "language_loss": 0.90524685, + "learning_rate": 3.998359445712895e-06, + "loss": 0.92892241, + "num_input_tokens_seen": 7481650, + "step": 354, + "time_per_iteration": 2.628727436065674 + }, + { + "auxiliary_loss_clip": 0.01261125, + "auxiliary_loss_mlp": 0.01100984, + "balance_loss_clip": 1.08076227, + "balance_loss_mlp": 1.01100445, + "epoch": 0.04268622617687729, + "flos": 23331127115520.0, + "grad_norm": 3.363155674724037, + "language_loss": 0.81172514, + "learning_rate": 3.9983277493318955e-06, + "loss": 0.83534622, + "num_input_tokens_seen": 7500945, + "step": 355, + "time_per_iteration": 2.7291293144226074 + }, + { + "auxiliary_loss_clip": 0.01261036, + "auxiliary_loss_mlp": 0.01102484, + "balance_loss_clip": 1.08039308, + "balance_loss_mlp": 1.01350665, + "epoch": 0.04280646906751638, + "flos": 25994010908160.0, + "grad_norm": 1.7040865542193215, + "language_loss": 0.81102061, + "learning_rate": 3.998295749806165e-06, + "loss": 0.83465582, + "num_input_tokens_seen": 7522170, + "step": 356, + "time_per_iteration": 2.713106632232666 + }, + { + "auxiliary_loss_clip": 0.01262731, + "auxiliary_loss_mlp": 0.01100329, + "balance_loss_clip": 1.08272851, + "balance_loss_mlp": 1.011971, + "epoch": 0.04292671195815547, + "flos": 26906824258560.0, + "grad_norm": 2.30370267776026, + "language_loss": 0.83392054, + "learning_rate": 3.998263447140558e-06, + "loss": 0.8575511, + "num_input_tokens_seen": 7542370, + "step": 357, + "time_per_iteration": 2.706547737121582 + }, + { + "auxiliary_loss_clip": 0.01258719, + "auxiliary_loss_mlp": 0.01105188, + "balance_loss_clip": 1.07861936, + "balance_loss_mlp": 1.01621056, + "epoch": 0.04304695484879457, + "flos": 39457315745280.0, + "grad_norm": 2.061343007574263, + "language_loss": 0.81696659, + "learning_rate": 3.998230841339976e-06, + "loss": 0.84060574, + "num_input_tokens_seen": 7564380, + "step": 358, + "time_per_iteration": 2.8739259243011475 + }, + { + "auxiliary_loss_clip": 0.01262495, + "auxiliary_loss_mlp": 0.01108722, + "balance_loss_clip": 1.08290482, + "balance_loss_mlp": 1.01969647, + "epoch": 0.04316719773943366, + "flos": 19646297475840.0, + "grad_norm": 2.0857568086047684, + "language_loss": 0.84889704, + "learning_rate": 3.998197932409363e-06, + "loss": 0.87260914, + "num_input_tokens_seen": 7582390, + "step": 359, + "time_per_iteration": 2.748945474624634 + }, + { + "auxiliary_loss_clip": 0.01260767, + "auxiliary_loss_mlp": 0.01105247, + "balance_loss_clip": 1.08131742, + "balance_loss_mlp": 1.01688862, + "epoch": 0.04328744063007275, + "flos": 22452320966400.0, + "grad_norm": 2.626183986855677, + "language_loss": 0.86375469, + "learning_rate": 3.9981647203537125e-06, + "loss": 0.88741481, + "num_input_tokens_seen": 7599890, + "step": 360, + "time_per_iteration": 2.7118303775787354 + }, + { + "auxiliary_loss_clip": 0.01261378, + "auxiliary_loss_mlp": 0.0110396, + "balance_loss_clip": 1.08210111, + "balance_loss_mlp": 1.01512551, + "epoch": 0.04340768352071184, + "flos": 21283033530240.0, + "grad_norm": 1.9816581686298, + "language_loss": 0.95923197, + "learning_rate": 3.998131205178063e-06, + "loss": 0.98288536, + "num_input_tokens_seen": 7618360, + "step": 361, + "time_per_iteration": 2.649271011352539 + }, + { + "auxiliary_loss_clip": 0.01259506, + "auxiliary_loss_mlp": 0.01104115, + "balance_loss_clip": 1.08051658, + "balance_loss_mlp": 1.01504183, + "epoch": 0.04352792641135093, + "flos": 11583705951360.0, + "grad_norm": 4.273880212955633, + "language_loss": 0.76420963, + "learning_rate": 3.998097386887498e-06, + "loss": 0.78784585, + "num_input_tokens_seen": 7635435, + "step": 362, + "time_per_iteration": 2.755418300628662 + }, + { + "auxiliary_loss_clip": 0.0125918, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.08014488, + "balance_loss_mlp": 1.01538742, + "epoch": 0.04364816930199002, + "flos": 23623547736960.0, + "grad_norm": 1.7115323163724305, + "language_loss": 0.84729314, + "learning_rate": 3.998063265487148e-06, + "loss": 0.8709181, + "num_input_tokens_seen": 7656485, + "step": 363, + "time_per_iteration": 2.7921457290649414 + }, + { + "auxiliary_loss_clip": 0.01259119, + "auxiliary_loss_mlp": 0.01107126, + "balance_loss_clip": 1.07995403, + "balance_loss_mlp": 1.01852918, + "epoch": 0.043768412192629114, + "flos": 14429734214400.0, + "grad_norm": 2.025246355930404, + "language_loss": 0.80880725, + "learning_rate": 3.99802884098219e-06, + "loss": 0.8324697, + "num_input_tokens_seen": 7674595, + "step": 364, + "time_per_iteration": 2.7098429203033447 + }, + { + "auxiliary_loss_clip": 0.01257166, + "auxiliary_loss_mlp": 0.0110177, + "balance_loss_clip": 1.07856548, + "balance_loss_mlp": 1.0129354, + "epoch": 0.043888655083268203, + "flos": 26468893641600.0, + "grad_norm": 2.337330709464763, + "language_loss": 0.82290369, + "learning_rate": 3.997994113377845e-06, + "loss": 0.84649301, + "num_input_tokens_seen": 7693495, + "step": 365, + "time_per_iteration": 2.7445900440216064 + }, + { + "auxiliary_loss_clip": 0.01258709, + "auxiliary_loss_mlp": 0.01103681, + "balance_loss_clip": 1.08036804, + "balance_loss_mlp": 1.01537085, + "epoch": 0.04400889797390729, + "flos": 27235263242880.0, + "grad_norm": 9.675170085562245, + "language_loss": 0.8339113, + "learning_rate": 3.9979590826793815e-06, + "loss": 0.85753518, + "num_input_tokens_seen": 7714685, + "step": 366, + "time_per_iteration": 2.749793767929077 + }, + { + "auxiliary_loss_clip": 0.01260412, + "auxiliary_loss_mlp": 0.01102658, + "balance_loss_clip": 1.08216667, + "balance_loss_mlp": 1.01387143, + "epoch": 0.04412914086454638, + "flos": 20119528183680.0, + "grad_norm": 2.254080708072454, + "language_loss": 0.80726552, + "learning_rate": 3.997923748892113e-06, + "loss": 0.83089626, + "num_input_tokens_seen": 7734005, + "step": 367, + "time_per_iteration": 2.7261271476745605 + }, + { + "auxiliary_loss_clip": 0.01258552, + "auxiliary_loss_mlp": 0.01105127, + "balance_loss_clip": 1.0802362, + "balance_loss_mlp": 1.0168165, + "epoch": 0.04424938375518547, + "flos": 22604618632320.0, + "grad_norm": 1.641521337662293, + "language_loss": 0.88598299, + "learning_rate": 3.9978881120214015e-06, + "loss": 0.90961969, + "num_input_tokens_seen": 7755525, + "step": 368, + "time_per_iteration": 2.7107229232788086 + }, + { + "auxiliary_loss_clip": 0.01257484, + "auxiliary_loss_mlp": 0.01101149, + "balance_loss_clip": 1.07925785, + "balance_loss_mlp": 1.01231444, + "epoch": 0.04436962664582456, + "flos": 24132365844480.0, + "grad_norm": 2.5608471332514355, + "language_loss": 0.79779965, + "learning_rate": 3.997852172072652e-06, + "loss": 0.82138598, + "num_input_tokens_seen": 7776740, + "step": 369, + "time_per_iteration": 2.7268946170806885 + }, + { + "auxiliary_loss_clip": 0.01258298, + "auxiliary_loss_mlp": 0.01103203, + "balance_loss_clip": 1.07985413, + "balance_loss_mlp": 1.01432061, + "epoch": 0.04448986953646366, + "flos": 18222906251520.0, + "grad_norm": 2.5400429299745344, + "language_loss": 0.89032376, + "learning_rate": 3.9978159290513155e-06, + "loss": 0.91393882, + "num_input_tokens_seen": 7794820, + "step": 370, + "time_per_iteration": 2.6682310104370117 + }, + { + "auxiliary_loss_clip": 0.01260542, + "auxiliary_loss_mlp": 0.01106308, + "balance_loss_clip": 1.08230662, + "balance_loss_mlp": 1.01723528, + "epoch": 0.04461011242710275, + "flos": 30117920400000.0, + "grad_norm": 2.8369429014508376, + "language_loss": 0.80156994, + "learning_rate": 3.997779382962892e-06, + "loss": 0.82523841, + "num_input_tokens_seen": 7817705, + "step": 371, + "time_per_iteration": 2.6948933601379395 + }, + { + "auxiliary_loss_clip": 0.01255074, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_clip": 1.07692313, + "balance_loss_mlp": 1.01678824, + "epoch": 0.04473035531774184, + "flos": 29752529299200.0, + "grad_norm": 1.8203625906279934, + "language_loss": 0.73680115, + "learning_rate": 3.997742533812924e-06, + "loss": 0.7604053, + "num_input_tokens_seen": 7840970, + "step": 372, + "time_per_iteration": 2.7154994010925293 + }, + { + "auxiliary_loss_clip": 0.01257376, + "auxiliary_loss_mlp": 0.01102623, + "balance_loss_clip": 1.07927084, + "balance_loss_mlp": 1.01412177, + "epoch": 0.04485059820838093, + "flos": 13151565676800.0, + "grad_norm": 2.579629584375704, + "language_loss": 0.92497098, + "learning_rate": 3.997705381607001e-06, + "loss": 0.94857097, + "num_input_tokens_seen": 7857785, + "step": 373, + "time_per_iteration": 3.6243300437927246 + }, + { + "auxiliary_loss_clip": 0.01264821, + "auxiliary_loss_mlp": 0.01083094, + "balance_loss_clip": 1.09439611, + "balance_loss_mlp": 1.00069606, + "epoch": 0.04497084109902002, + "flos": 68094209548800.0, + "grad_norm": 6.194742795995657, + "language_loss": 0.60296953, + "learning_rate": 3.997667926350761e-06, + "loss": 0.62644863, + "num_input_tokens_seen": 7916115, + "step": 374, + "time_per_iteration": 5.980962753295898 + }, + { + "auxiliary_loss_clip": 0.01264791, + "auxiliary_loss_mlp": 0.01083396, + "balance_loss_clip": 1.09446502, + "balance_loss_mlp": 1.00099897, + "epoch": 0.04509108398965911, + "flos": 64342263346560.0, + "grad_norm": 0.9056753438041001, + "language_loss": 0.57797682, + "learning_rate": 3.997630168049886e-06, + "loss": 0.60145867, + "num_input_tokens_seen": 7974480, + "step": 375, + "time_per_iteration": 3.212451219558716 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01102674, + "balance_loss_clip": 1.07752299, + "balance_loss_mlp": 1.01364839, + "epoch": 0.045211326880298205, + "flos": 22271115830400.0, + "grad_norm": 2.0270277733442046, + "language_loss": 0.7740137, + "learning_rate": 3.997592106710101e-06, + "loss": 0.79759002, + "num_input_tokens_seen": 7993940, + "step": 376, + "time_per_iteration": 2.6931135654449463 + }, + { + "auxiliary_loss_clip": 0.01255184, + "auxiliary_loss_mlp": 0.01103824, + "balance_loss_clip": 1.07797265, + "balance_loss_mlp": 1.01570487, + "epoch": 0.045331569770937295, + "flos": 32159441796480.0, + "grad_norm": 3.3704784596894966, + "language_loss": 0.66029841, + "learning_rate": 3.997553742337182e-06, + "loss": 0.68388849, + "num_input_tokens_seen": 8013365, + "step": 377, + "time_per_iteration": 2.742323875427246 + }, + { + "auxiliary_loss_clip": 0.01256523, + "auxiliary_loss_mlp": 0.01104238, + "balance_loss_clip": 1.07951474, + "balance_loss_mlp": 1.01525974, + "epoch": 0.045451812661576385, + "flos": 22163455791360.0, + "grad_norm": 2.502696853364906, + "language_loss": 0.912947, + "learning_rate": 3.997515074936949e-06, + "loss": 0.93655461, + "num_input_tokens_seen": 8034240, + "step": 378, + "time_per_iteration": 2.780550479888916 + }, + { + "auxiliary_loss_clip": 0.01256927, + "auxiliary_loss_mlp": 0.01107051, + "balance_loss_clip": 1.08007646, + "balance_loss_mlp": 1.01845515, + "epoch": 0.045572055552215475, + "flos": 16581968305920.0, + "grad_norm": 2.2030332038940483, + "language_loss": 0.86683851, + "learning_rate": 3.997476104515268e-06, + "loss": 0.89047831, + "num_input_tokens_seen": 8052430, + "step": 379, + "time_per_iteration": 2.7163197994232178 + }, + { + "auxiliary_loss_clip": 0.01254807, + "auxiliary_loss_mlp": 0.01101143, + "balance_loss_clip": 1.07796347, + "balance_loss_mlp": 1.01292825, + "epoch": 0.045692298442854565, + "flos": 17603375448960.0, + "grad_norm": 1.934439480723937, + "language_loss": 0.77496552, + "learning_rate": 3.9974368310780485e-06, + "loss": 0.79852509, + "num_input_tokens_seen": 8069605, + "step": 380, + "time_per_iteration": 2.7062501907348633 + }, + { + "auxiliary_loss_clip": 0.01258331, + "auxiliary_loss_mlp": 0.01108909, + "balance_loss_clip": 1.08110523, + "balance_loss_mlp": 1.01897717, + "epoch": 0.045812541333493655, + "flos": 26761098781440.0, + "grad_norm": 3.255261074988511, + "language_loss": 0.74645269, + "learning_rate": 3.997397254631251e-06, + "loss": 0.77012503, + "num_input_tokens_seen": 8090225, + "step": 381, + "time_per_iteration": 2.721203565597534 + }, + { + "auxiliary_loss_clip": 0.01260833, + "auxiliary_loss_mlp": 0.0108242, + "balance_loss_clip": 1.0916512, + "balance_loss_mlp": 1.00002289, + "epoch": 0.04593278422413275, + "flos": 60250349894400.0, + "grad_norm": 0.8015130189754298, + "language_loss": 0.60040987, + "learning_rate": 3.997357375180878e-06, + "loss": 0.62384248, + "num_input_tokens_seen": 8154505, + "step": 382, + "time_per_iteration": 3.405010461807251 + }, + { + "auxiliary_loss_clip": 0.01254632, + "auxiliary_loss_mlp": 0.01102152, + "balance_loss_clip": 1.07775307, + "balance_loss_mlp": 1.01398492, + "epoch": 0.04605302711477184, + "flos": 21799249839360.0, + "grad_norm": 1.9788746449115546, + "language_loss": 0.7526145, + "learning_rate": 3.997317192732979e-06, + "loss": 0.77618229, + "num_input_tokens_seen": 8173285, + "step": 383, + "time_per_iteration": 2.7098217010498047 + }, + { + "auxiliary_loss_clip": 0.01253766, + "auxiliary_loss_mlp": 0.01104937, + "balance_loss_clip": 1.07702613, + "balance_loss_mlp": 1.0161978, + "epoch": 0.04617327000541093, + "flos": 19459705299840.0, + "grad_norm": 1.7810080680165796, + "language_loss": 0.82574105, + "learning_rate": 3.99727670729365e-06, + "loss": 0.84932804, + "num_input_tokens_seen": 8191845, + "step": 384, + "time_per_iteration": 2.6537015438079834 + }, + { + "auxiliary_loss_clip": 0.01257631, + "auxiliary_loss_mlp": 0.01105199, + "balance_loss_clip": 1.08095574, + "balance_loss_mlp": 1.01679337, + "epoch": 0.04629351289605002, + "flos": 25411468135680.0, + "grad_norm": 1.825406296444417, + "language_loss": 0.77879846, + "learning_rate": 3.997235918869033e-06, + "loss": 0.80242676, + "num_input_tokens_seen": 8212880, + "step": 385, + "time_per_iteration": 2.7863481044769287 + }, + { + "auxiliary_loss_clip": 0.01255433, + "auxiliary_loss_mlp": 0.01100657, + "balance_loss_clip": 1.07905412, + "balance_loss_mlp": 1.01263261, + "epoch": 0.04641375578668911, + "flos": 20558284813440.0, + "grad_norm": 1.8145279587784073, + "language_loss": 0.82504904, + "learning_rate": 3.997194827465315e-06, + "loss": 0.84860992, + "num_input_tokens_seen": 8231475, + "step": 386, + "time_per_iteration": 2.7934257984161377 + }, + { + "auxiliary_loss_clip": 0.01251967, + "auxiliary_loss_mlp": 0.01103204, + "balance_loss_clip": 1.07632971, + "balance_loss_mlp": 1.0147506, + "epoch": 0.0465339986773282, + "flos": 13188661447680.0, + "grad_norm": 2.5944078824668546, + "language_loss": 0.91392481, + "learning_rate": 3.997153433088728e-06, + "loss": 0.93747652, + "num_input_tokens_seen": 8248600, + "step": 387, + "time_per_iteration": 2.7004799842834473 + }, + { + "auxiliary_loss_clip": 0.01255578, + "auxiliary_loss_mlp": 0.01101929, + "balance_loss_clip": 1.07936335, + "balance_loss_mlp": 1.01347566, + "epoch": 0.0466542415679673, + "flos": 25556547168000.0, + "grad_norm": 1.9061436639673812, + "language_loss": 0.81280077, + "learning_rate": 3.997111735745554e-06, + "loss": 0.83637583, + "num_input_tokens_seen": 8271570, + "step": 388, + "time_per_iteration": 2.7396843433380127 + }, + { + "auxiliary_loss_clip": 0.01253758, + "auxiliary_loss_mlp": 0.01103413, + "balance_loss_clip": 1.07780635, + "balance_loss_mlp": 1.01491225, + "epoch": 0.04677448445860639, + "flos": 22236749493120.0, + "grad_norm": 1.8649978584580182, + "language_loss": 0.82539272, + "learning_rate": 3.997069735442118e-06, + "loss": 0.84896445, + "num_input_tokens_seen": 8291265, + "step": 389, + "time_per_iteration": 2.6908857822418213 + }, + { + "auxiliary_loss_clip": 0.01250442, + "auxiliary_loss_mlp": 0.01105355, + "balance_loss_clip": 1.07500887, + "balance_loss_mlp": 1.01728356, + "epoch": 0.04689472734924548, + "flos": 28147825198080.0, + "grad_norm": 1.505351078547748, + "language_loss": 0.80358881, + "learning_rate": 3.997027432184792e-06, + "loss": 0.82714677, + "num_input_tokens_seen": 8315925, + "step": 390, + "time_per_iteration": 2.771481990814209 + }, + { + "auxiliary_loss_clip": 0.01254152, + "auxiliary_loss_mlp": 0.01102548, + "balance_loss_clip": 1.07812548, + "balance_loss_mlp": 1.01471484, + "epoch": 0.04701497023988457, + "flos": 23148952312320.0, + "grad_norm": 3.219136267970817, + "language_loss": 0.89547187, + "learning_rate": 3.99698482597999e-06, + "loss": 0.91903889, + "num_input_tokens_seen": 8333605, + "step": 391, + "time_per_iteration": 2.731519937515259 + }, + { + "auxiliary_loss_clip": 0.0125573, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_clip": 1.0882082, + "balance_loss_mlp": 1.00021255, + "epoch": 0.04713521313052366, + "flos": 64827668764800.0, + "grad_norm": 0.8982787666157134, + "language_loss": 0.64042985, + "learning_rate": 3.99694191683418e-06, + "loss": 0.66380942, + "num_input_tokens_seen": 8394405, + "step": 392, + "time_per_iteration": 3.285191535949707 + }, + { + "auxiliary_loss_clip": 0.01254968, + "auxiliary_loss_mlp": 0.01104163, + "balance_loss_clip": 1.07947111, + "balance_loss_mlp": 1.01551867, + "epoch": 0.047255456021162746, + "flos": 18771585477120.0, + "grad_norm": 1.8978554641620895, + "language_loss": 0.81786191, + "learning_rate": 3.996898704753867e-06, + "loss": 0.84145319, + "num_input_tokens_seen": 8412355, + "step": 393, + "time_per_iteration": 2.7011635303497314 + }, + { + "auxiliary_loss_clip": 0.01251236, + "auxiliary_loss_mlp": 0.01098426, + "balance_loss_clip": 1.07557333, + "balance_loss_mlp": 1.01092672, + "epoch": 0.04737569891180184, + "flos": 22053820504320.0, + "grad_norm": 2.398261548309884, + "language_loss": 0.87789202, + "learning_rate": 3.996855189745609e-06, + "loss": 0.90138865, + "num_input_tokens_seen": 8431620, + "step": 394, + "time_per_iteration": 2.6951279640197754 + }, + { + "auxiliary_loss_clip": 0.01248897, + "auxiliary_loss_mlp": 0.01103699, + "balance_loss_clip": 1.07319248, + "balance_loss_mlp": 1.0160563, + "epoch": 0.04749594180244093, + "flos": 29057370410880.0, + "grad_norm": 2.370093037910965, + "language_loss": 0.92452109, + "learning_rate": 3.996811371816007e-06, + "loss": 0.9480471, + "num_input_tokens_seen": 8454045, + "step": 395, + "time_per_iteration": 2.6841752529144287 + }, + { + "auxiliary_loss_clip": 0.01251283, + "auxiliary_loss_mlp": 0.01103196, + "balance_loss_clip": 1.07606459, + "balance_loss_mlp": 1.01574397, + "epoch": 0.04761618469308002, + "flos": 35112268172160.0, + "grad_norm": 1.8906873847598622, + "language_loss": 0.77962899, + "learning_rate": 3.996767250971707e-06, + "loss": 0.80317384, + "num_input_tokens_seen": 8476785, + "step": 396, + "time_per_iteration": 2.8429768085479736 + }, + { + "auxiliary_loss_clip": 0.01253793, + "auxiliary_loss_mlp": 0.01103403, + "balance_loss_clip": 1.07901788, + "balance_loss_mlp": 1.0157609, + "epoch": 0.04773642758371911, + "flos": 25630702796160.0, + "grad_norm": 1.8648709276314959, + "language_loss": 0.8711434, + "learning_rate": 3.996722827219403e-06, + "loss": 0.89471537, + "num_input_tokens_seen": 8498400, + "step": 397, + "time_per_iteration": 2.685746431350708 + }, + { + "auxiliary_loss_clip": 0.01250352, + "auxiliary_loss_mlp": 0.01101186, + "balance_loss_clip": 1.07572699, + "balance_loss_mlp": 1.01311421, + "epoch": 0.0478566704743582, + "flos": 20631506688000.0, + "grad_norm": 2.608687771625877, + "language_loss": 0.82537889, + "learning_rate": 3.996678100565833e-06, + "loss": 0.84889424, + "num_input_tokens_seen": 8517455, + "step": 398, + "time_per_iteration": 2.7697484493255615 + }, + { + "auxiliary_loss_clip": 0.01250034, + "auxiliary_loss_mlp": 0.01101482, + "balance_loss_clip": 1.07592428, + "balance_loss_mlp": 1.01383972, + "epoch": 0.04797691336499729, + "flos": 18835721210880.0, + "grad_norm": 2.204458472711422, + "language_loss": 0.88532209, + "learning_rate": 3.996633071017783e-06, + "loss": 0.90883726, + "num_input_tokens_seen": 8534085, + "step": 399, + "time_per_iteration": 3.6269657611846924 + }, + { + "auxiliary_loss_clip": 0.0125148, + "auxiliary_loss_mlp": 0.01105891, + "balance_loss_clip": 1.07715559, + "balance_loss_mlp": 1.01739049, + "epoch": 0.04809715625563638, + "flos": 21099673578240.0, + "grad_norm": 2.5524690194361246, + "language_loss": 0.81698275, + "learning_rate": 3.996587738582084e-06, + "loss": 0.8405565, + "num_input_tokens_seen": 8550885, + "step": 400, + "time_per_iteration": 5.5924904346466064 + }, + { + "auxiliary_loss_clip": 0.01248127, + "auxiliary_loss_mlp": 0.01104064, + "balance_loss_clip": 1.07359028, + "balance_loss_mlp": 1.01613474, + "epoch": 0.04821739914627548, + "flos": 23805650712960.0, + "grad_norm": 2.836466808060065, + "language_loss": 0.86088753, + "learning_rate": 3.9965421032656115e-06, + "loss": 0.88440943, + "num_input_tokens_seen": 8570815, + "step": 401, + "time_per_iteration": 2.779174566268921 + }, + { + "auxiliary_loss_clip": 0.01249546, + "auxiliary_loss_mlp": 0.01103725, + "balance_loss_clip": 1.07517028, + "balance_loss_mlp": 1.01551044, + "epoch": 0.04833764203691457, + "flos": 22200587475840.0, + "grad_norm": 2.866261478285999, + "language_loss": 0.94251603, + "learning_rate": 3.99649616507529e-06, + "loss": 0.96604878, + "num_input_tokens_seen": 8589910, + "step": 402, + "time_per_iteration": 2.810283899307251 + }, + { + "auxiliary_loss_clip": 0.01248814, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_clip": 1.08314276, + "balance_loss_mlp": 1.0001328, + "epoch": 0.04845788492755366, + "flos": 65904376896000.0, + "grad_norm": 0.8983600547558007, + "language_loss": 0.6324501, + "learning_rate": 3.996449924018088e-06, + "loss": 0.65575969, + "num_input_tokens_seen": 8650370, + "step": 403, + "time_per_iteration": 3.191441297531128 + }, + { + "auxiliary_loss_clip": 0.01248207, + "auxiliary_loss_mlp": 0.01104999, + "balance_loss_clip": 1.0742023, + "balance_loss_mlp": 1.01749957, + "epoch": 0.04857812781819275, + "flos": 19281301424640.0, + "grad_norm": 1.7986893843327172, + "language_loss": 0.7958011, + "learning_rate": 3.99640338010102e-06, + "loss": 0.8193332, + "num_input_tokens_seen": 8669475, + "step": 404, + "time_per_iteration": 2.751605987548828 + }, + { + "auxiliary_loss_clip": 0.01248999, + "auxiliary_loss_mlp": 0.01102207, + "balance_loss_clip": 1.07466555, + "balance_loss_mlp": 1.014135, + "epoch": 0.04869837070883184, + "flos": 24062376193920.0, + "grad_norm": 6.19335614528935, + "language_loss": 0.7899462, + "learning_rate": 3.996356533331146e-06, + "loss": 0.81345826, + "num_input_tokens_seen": 8691345, + "step": 405, + "time_per_iteration": 2.786191463470459 + }, + { + "auxiliary_loss_clip": 0.01247362, + "auxiliary_loss_mlp": 0.01100847, + "balance_loss_clip": 1.0734812, + "balance_loss_mlp": 1.01263201, + "epoch": 0.04881861359947093, + "flos": 25187169657600.0, + "grad_norm": 2.3574166418580655, + "language_loss": 0.62001848, + "learning_rate": 3.996309383715573e-06, + "loss": 0.64350057, + "num_input_tokens_seen": 8710125, + "step": 406, + "time_per_iteration": 2.7754151821136475 + }, + { + "auxiliary_loss_clip": 0.01250395, + "auxiliary_loss_mlp": 0.0110233, + "balance_loss_clip": 1.07639933, + "balance_loss_mlp": 1.01406741, + "epoch": 0.048938856490110025, + "flos": 16362913213440.0, + "grad_norm": 2.6418288729408665, + "language_loss": 0.73852289, + "learning_rate": 3.996261931261454e-06, + "loss": 0.76205009, + "num_input_tokens_seen": 8728705, + "step": 407, + "time_per_iteration": 2.7267251014709473 + }, + { + "auxiliary_loss_clip": 0.01250785, + "auxiliary_loss_mlp": 0.01102191, + "balance_loss_clip": 1.07702076, + "balance_loss_mlp": 1.01469135, + "epoch": 0.049059099380749115, + "flos": 29895094379520.0, + "grad_norm": 1.914946780152842, + "language_loss": 0.86531591, + "learning_rate": 3.996214175975987e-06, + "loss": 0.88884568, + "num_input_tokens_seen": 8749225, + "step": 408, + "time_per_iteration": 2.7828638553619385 + }, + { + "auxiliary_loss_clip": 0.01251648, + "auxiliary_loss_mlp": 0.01104959, + "balance_loss_clip": 1.07777596, + "balance_loss_mlp": 1.01660132, + "epoch": 0.049179342271388204, + "flos": 35918858027520.0, + "grad_norm": 2.179254103231119, + "language_loss": 0.79015481, + "learning_rate": 3.996166117866417e-06, + "loss": 0.81372094, + "num_input_tokens_seen": 8771160, + "step": 409, + "time_per_iteration": 2.92042875289917 + }, + { + "auxiliary_loss_clip": 0.0124602, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_clip": 1.07291055, + "balance_loss_mlp": 1.01335526, + "epoch": 0.049299585162027294, + "flos": 14611226659200.0, + "grad_norm": 1.9702775916817483, + "language_loss": 0.86793858, + "learning_rate": 3.996117756940035e-06, + "loss": 0.89140826, + "num_input_tokens_seen": 8787845, + "step": 410, + "time_per_iteration": 2.7123193740844727 + }, + { + "auxiliary_loss_clip": 0.01248547, + "auxiliary_loss_mlp": 0.01102374, + "balance_loss_clip": 1.07539856, + "balance_loss_mlp": 1.0148263, + "epoch": 0.049419828052666384, + "flos": 19567939956480.0, + "grad_norm": 2.0443158823751606, + "language_loss": 0.97552264, + "learning_rate": 3.996069093204175e-06, + "loss": 0.9990319, + "num_input_tokens_seen": 8803805, + "step": 411, + "time_per_iteration": 2.708770275115967 + }, + { + "auxiliary_loss_clip": 0.01249299, + "auxiliary_loss_mlp": 0.01105757, + "balance_loss_clip": 1.07638836, + "balance_loss_mlp": 1.01763797, + "epoch": 0.049540070943305474, + "flos": 13659916907520.0, + "grad_norm": 2.4090944539467305, + "language_loss": 0.88202423, + "learning_rate": 3.996020126666221e-06, + "loss": 0.9055748, + "num_input_tokens_seen": 8820785, + "step": 412, + "time_per_iteration": 2.731743097305298 + }, + { + "auxiliary_loss_clip": 0.01247769, + "auxiliary_loss_mlp": 0.01102048, + "balance_loss_clip": 1.07469666, + "balance_loss_mlp": 1.01392817, + "epoch": 0.04966031383394457, + "flos": 21832035978240.0, + "grad_norm": 2.1365786124672286, + "language_loss": 0.82105672, + "learning_rate": 3.995970857333601e-06, + "loss": 0.8445549, + "num_input_tokens_seen": 8841195, + "step": 413, + "time_per_iteration": 2.815535068511963 + }, + { + "auxiliary_loss_clip": 0.01247887, + "auxiliary_loss_mlp": 0.01105588, + "balance_loss_clip": 1.07451761, + "balance_loss_mlp": 1.01780283, + "epoch": 0.04978055672458366, + "flos": 28618793349120.0, + "grad_norm": 2.324381155786535, + "language_loss": 0.79501486, + "learning_rate": 3.995921285213789e-06, + "loss": 0.81854963, + "num_input_tokens_seen": 8861455, + "step": 414, + "time_per_iteration": 2.810851573944092 + }, + { + "auxiliary_loss_clip": 0.01249829, + "auxiliary_loss_mlp": 0.0110242, + "balance_loss_clip": 1.07703376, + "balance_loss_mlp": 1.01458645, + "epoch": 0.04990079961522275, + "flos": 19828220883840.0, + "grad_norm": 2.562108778403272, + "language_loss": 0.8091923, + "learning_rate": 3.995871410314305e-06, + "loss": 0.8327148, + "num_input_tokens_seen": 8880015, + "step": 415, + "time_per_iteration": 2.7188432216644287 + }, + { + "auxiliary_loss_clip": 0.01234981, + "auxiliary_loss_mlp": 0.01082046, + "balance_loss_clip": 1.07934523, + "balance_loss_mlp": 1.00003004, + "epoch": 0.05002104250586184, + "flos": 62735045293440.0, + "grad_norm": 0.8971681665564915, + "language_loss": 0.59651983, + "learning_rate": 3.995821232642714e-06, + "loss": 0.61969006, + "num_input_tokens_seen": 8938420, + "step": 416, + "time_per_iteration": 3.355717182159424 + }, + { + "auxiliary_loss_clip": 0.01231575, + "auxiliary_loss_mlp": 0.01095547, + "balance_loss_clip": 1.07238472, + "balance_loss_mlp": 1.00861979, + "epoch": 0.05014128539650093, + "flos": 27928518710400.0, + "grad_norm": 2.876839045487386, + "language_loss": 0.82131273, + "learning_rate": 3.995770752206629e-06, + "loss": 0.84458393, + "num_input_tokens_seen": 8959495, + "step": 417, + "time_per_iteration": 2.942729949951172 + }, + { + "auxiliary_loss_clip": 0.01247665, + "auxiliary_loss_mlp": 0.01099473, + "balance_loss_clip": 1.07530856, + "balance_loss_mlp": 1.01202154, + "epoch": 0.05026152828714002, + "flos": 17705576620800.0, + "grad_norm": 2.0130708722146475, + "language_loss": 0.97256756, + "learning_rate": 3.995719969013709e-06, + "loss": 0.99603897, + "num_input_tokens_seen": 8976675, + "step": 418, + "time_per_iteration": 2.747615098953247 + }, + { + "auxiliary_loss_clip": 0.01220593, + "auxiliary_loss_mlp": 0.01098169, + "balance_loss_clip": 1.06996071, + "balance_loss_mlp": 1.01038313, + "epoch": 0.05038177117777912, + "flos": 19133277477120.0, + "grad_norm": 2.7584379603244686, + "language_loss": 0.85497034, + "learning_rate": 3.995668883071655e-06, + "loss": 0.87815803, + "num_input_tokens_seen": 8992900, + "step": 419, + "time_per_iteration": 2.763664960861206 + }, + { + "auxiliary_loss_clip": 0.01247466, + "auxiliary_loss_mlp": 0.01101984, + "balance_loss_clip": 1.07510042, + "balance_loss_mlp": 1.01391268, + "epoch": 0.050502014068418206, + "flos": 20667704618880.0, + "grad_norm": 3.7402513178806926, + "language_loss": 0.90916753, + "learning_rate": 3.995617494388219e-06, + "loss": 0.93266201, + "num_input_tokens_seen": 9011020, + "step": 420, + "time_per_iteration": 2.7723588943481445 + }, + { + "auxiliary_loss_clip": 0.01220355, + "auxiliary_loss_mlp": 0.01101118, + "balance_loss_clip": 1.06644177, + "balance_loss_mlp": 1.01304603, + "epoch": 0.050622256959057296, + "flos": 21361103740800.0, + "grad_norm": 2.040894696864362, + "language_loss": 0.80694443, + "learning_rate": 3.995565802971196e-06, + "loss": 0.83015913, + "num_input_tokens_seen": 9030995, + "step": 421, + "time_per_iteration": 2.853689670562744 + }, + { + "auxiliary_loss_clip": 0.01223895, + "auxiliary_loss_mlp": 0.01100932, + "balance_loss_clip": 1.07081223, + "balance_loss_mlp": 1.01328945, + "epoch": 0.050742499849696386, + "flos": 27673588909440.0, + "grad_norm": 1.9705969449520269, + "language_loss": 0.67481196, + "learning_rate": 3.995513808828427e-06, + "loss": 0.69806021, + "num_input_tokens_seen": 9053790, + "step": 422, + "time_per_iteration": 2.807149887084961 + }, + { + "auxiliary_loss_clip": 0.01222905, + "auxiliary_loss_mlp": 0.01101089, + "balance_loss_clip": 1.07037187, + "balance_loss_mlp": 1.01373219, + "epoch": 0.050862742740335476, + "flos": 19865999013120.0, + "grad_norm": 2.0141999661442087, + "language_loss": 0.76783192, + "learning_rate": 3.9954615119678e-06, + "loss": 0.79107183, + "num_input_tokens_seen": 9072345, + "step": 423, + "time_per_iteration": 2.7854180335998535 + }, + { + "auxiliary_loss_clip": 0.01232973, + "auxiliary_loss_mlp": 0.0109984, + "balance_loss_clip": 1.07083368, + "balance_loss_mlp": 1.01214993, + "epoch": 0.050982985630974566, + "flos": 22085098272000.0, + "grad_norm": 2.035072599626371, + "language_loss": 0.80687302, + "learning_rate": 3.995408912397248e-06, + "loss": 0.83020115, + "num_input_tokens_seen": 9090240, + "step": 424, + "time_per_iteration": 3.7038075923919678 + }, + { + "auxiliary_loss_clip": 0.01218407, + "auxiliary_loss_mlp": 0.01100133, + "balance_loss_clip": 1.06690836, + "balance_loss_mlp": 1.01210856, + "epoch": 0.05110322852161366, + "flos": 20740962407040.0, + "grad_norm": 2.3645572026055546, + "language_loss": 0.9303323, + "learning_rate": 3.99535601012475e-06, + "loss": 0.95351762, + "num_input_tokens_seen": 9105570, + "step": 425, + "time_per_iteration": 4.67680811882019 + }, + { + "auxiliary_loss_clip": 0.01213474, + "auxiliary_loss_mlp": 0.0087701, + "balance_loss_clip": 1.06996036, + "balance_loss_mlp": 1.001302, + "epoch": 0.05122347141225275, + "flos": 28547295327360.0, + "grad_norm": 1.5069264123008463, + "language_loss": 0.75613546, + "learning_rate": 3.995302805158333e-06, + "loss": 0.77704024, + "num_input_tokens_seen": 9128225, + "step": 426, + "time_per_iteration": 3.8378665447235107 + }, + { + "auxiliary_loss_clip": 0.01219517, + "auxiliary_loss_mlp": 0.01104699, + "balance_loss_clip": 1.06807804, + "balance_loss_mlp": 1.01624525, + "epoch": 0.05134371430289184, + "flos": 19722679747200.0, + "grad_norm": 1.8110302799550773, + "language_loss": 0.83509719, + "learning_rate": 3.9952492975060665e-06, + "loss": 0.85833931, + "num_input_tokens_seen": 9148295, + "step": 427, + "time_per_iteration": 2.8007562160491943 + }, + { + "auxiliary_loss_clip": 0.01230006, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_clip": 1.06971097, + "balance_loss_mlp": 1.01286948, + "epoch": 0.05146395719353093, + "flos": 34458945649920.0, + "grad_norm": 2.5840859836873196, + "language_loss": 0.85262311, + "learning_rate": 3.995195487176067e-06, + "loss": 0.875934, + "num_input_tokens_seen": 9168525, + "step": 428, + "time_per_iteration": 2.8520443439483643 + }, + { + "auxiliary_loss_clip": 0.01243087, + "auxiliary_loss_mlp": 0.01104737, + "balance_loss_clip": 1.07181013, + "balance_loss_mlp": 1.01699877, + "epoch": 0.05158420008417002, + "flos": 21760286561280.0, + "grad_norm": 1.8429666771233795, + "language_loss": 0.85203022, + "learning_rate": 3.995141374176499e-06, + "loss": 0.87550843, + "num_input_tokens_seen": 9186920, + "step": 429, + "time_per_iteration": 2.7333812713623047 + }, + { + "auxiliary_loss_clip": 0.01213099, + "auxiliary_loss_mlp": 0.00875717, + "balance_loss_clip": 1.0760783, + "balance_loss_mlp": 1.0011282, + "epoch": 0.05170444297480911, + "flos": 72553956226560.0, + "grad_norm": 0.9456923679062188, + "language_loss": 0.63231313, + "learning_rate": 3.995086958515572e-06, + "loss": 0.65320134, + "num_input_tokens_seen": 9244940, + "step": 430, + "time_per_iteration": 3.3384323120117188 + }, + { + "auxiliary_loss_clip": 0.01239858, + "auxiliary_loss_mlp": 0.00875816, + "balance_loss_clip": 1.07826352, + "balance_loss_mlp": 1.00123596, + "epoch": 0.05182468586544821, + "flos": 62416159326720.0, + "grad_norm": 0.8662901667425814, + "language_loss": 0.59993839, + "learning_rate": 3.995032240201538e-06, + "loss": 0.62109518, + "num_input_tokens_seen": 9307335, + "step": 431, + "time_per_iteration": 3.2282373905181885 + }, + { + "auxiliary_loss_clip": 0.01213977, + "auxiliary_loss_mlp": 0.01082022, + "balance_loss_clip": 1.07075834, + "balance_loss_mlp": 1.00038743, + "epoch": 0.0519449287560873, + "flos": 41225989432320.0, + "grad_norm": 0.934075067252116, + "language_loss": 0.63130713, + "learning_rate": 3.9949772192427e-06, + "loss": 0.65426719, + "num_input_tokens_seen": 9353960, + "step": 432, + "time_per_iteration": 3.047874689102173 + }, + { + "auxiliary_loss_clip": 0.01220154, + "auxiliary_loss_mlp": 0.01100922, + "balance_loss_clip": 1.06936276, + "balance_loss_mlp": 1.01323152, + "epoch": 0.05206517164672639, + "flos": 17494530261120.0, + "grad_norm": 1.8707727648073844, + "language_loss": 0.79665595, + "learning_rate": 3.994921895647405e-06, + "loss": 0.81986672, + "num_input_tokens_seen": 9372130, + "step": 433, + "time_per_iteration": 2.7364909648895264 + }, + { + "auxiliary_loss_clip": 0.01237064, + "auxiliary_loss_mlp": 0.0108177, + "balance_loss_clip": 1.07626748, + "balance_loss_mlp": 1.0001353, + "epoch": 0.05218541453736548, + "flos": 64002762973440.0, + "grad_norm": 0.8459536005087192, + "language_loss": 0.55353892, + "learning_rate": 3.994866269424043e-06, + "loss": 0.57672727, + "num_input_tokens_seen": 9428500, + "step": 434, + "time_per_iteration": 3.2223803997039795 + }, + { + "auxiliary_loss_clip": 0.01182272, + "auxiliary_loss_mlp": 0.01098643, + "balance_loss_clip": 1.05790758, + "balance_loss_mlp": 1.01085734, + "epoch": 0.05230565742800457, + "flos": 19317319787520.0, + "grad_norm": 2.1867321767435026, + "language_loss": 0.78397369, + "learning_rate": 3.9948103405810545e-06, + "loss": 0.80678284, + "num_input_tokens_seen": 9447450, + "step": 435, + "time_per_iteration": 2.8943753242492676 + }, + { + "auxiliary_loss_clip": 0.01199348, + "auxiliary_loss_mlp": 0.01100724, + "balance_loss_clip": 1.05654538, + "balance_loss_mlp": 1.01331949, + "epoch": 0.05242590031864366, + "flos": 25298636538240.0, + "grad_norm": 1.9659533839330863, + "language_loss": 0.86205244, + "learning_rate": 3.994754109126923e-06, + "loss": 0.88505316, + "num_input_tokens_seen": 9468945, + "step": 436, + "time_per_iteration": 2.90152645111084 + }, + { + "auxiliary_loss_clip": 0.01181605, + "auxiliary_loss_mlp": 0.01100362, + "balance_loss_clip": 1.06030035, + "balance_loss_mlp": 1.0131005, + "epoch": 0.052546143209282754, + "flos": 26211629456640.0, + "grad_norm": 2.365172997537715, + "language_loss": 0.93526757, + "learning_rate": 3.994697575070181e-06, + "loss": 0.95808721, + "num_input_tokens_seen": 9488405, + "step": 437, + "time_per_iteration": 2.977996349334717 + }, + { + "auxiliary_loss_clip": 0.01217437, + "auxiliary_loss_mlp": 0.01103036, + "balance_loss_clip": 1.0694077, + "balance_loss_mlp": 1.01553607, + "epoch": 0.052666386099921844, + "flos": 22158140578560.0, + "grad_norm": 1.8775662306504326, + "language_loss": 0.91356027, + "learning_rate": 3.994640738419402e-06, + "loss": 0.93676502, + "num_input_tokens_seen": 9507780, + "step": 438, + "time_per_iteration": 2.7757391929626465 + }, + { + "auxiliary_loss_clip": 0.01230486, + "auxiliary_loss_mlp": 0.01097747, + "balance_loss_clip": 1.06955481, + "balance_loss_mlp": 1.01062846, + "epoch": 0.052786628990560934, + "flos": 23881817502720.0, + "grad_norm": 2.313333554092958, + "language_loss": 0.80804086, + "learning_rate": 3.9945835991832075e-06, + "loss": 0.83132327, + "num_input_tokens_seen": 9529665, + "step": 439, + "time_per_iteration": 2.8955233097076416 + }, + { + "auxiliary_loss_clip": 0.01243379, + "auxiliary_loss_mlp": 0.01101951, + "balance_loss_clip": 1.07337236, + "balance_loss_mlp": 1.01464248, + "epoch": 0.052906871881200024, + "flos": 24605021934720.0, + "grad_norm": 2.1508577440981678, + "language_loss": 0.93050826, + "learning_rate": 3.994526157370268e-06, + "loss": 0.95396155, + "num_input_tokens_seen": 9548280, + "step": 440, + "time_per_iteration": 2.7999024391174316 + }, + { + "auxiliary_loss_clip": 0.01214549, + "auxiliary_loss_mlp": 0.01081811, + "balance_loss_clip": 1.07124233, + "balance_loss_mlp": 1.00017643, + "epoch": 0.053027114771839114, + "flos": 56461631143680.0, + "grad_norm": 0.9139426330025737, + "language_loss": 0.59318125, + "learning_rate": 3.994468412989296e-06, + "loss": 0.6161449, + "num_input_tokens_seen": 9609690, + "step": 441, + "time_per_iteration": 3.4410319328308105 + }, + { + "auxiliary_loss_clip": 0.01208726, + "auxiliary_loss_mlp": 0.01099653, + "balance_loss_clip": 1.06467426, + "balance_loss_mlp": 1.01272547, + "epoch": 0.053147357662478203, + "flos": 17311098481920.0, + "grad_norm": 2.0987220626480223, + "language_loss": 0.928666, + "learning_rate": 3.994410366049052e-06, + "loss": 0.95174968, + "num_input_tokens_seen": 9627550, + "step": 442, + "time_per_iteration": 2.854323148727417 + }, + { + "auxiliary_loss_clip": 0.0122391, + "auxiliary_loss_mlp": 0.01102762, + "balance_loss_clip": 1.06798792, + "balance_loss_mlp": 1.01507151, + "epoch": 0.0532676005531173, + "flos": 17164977955200.0, + "grad_norm": 2.03770554113451, + "language_loss": 0.82923824, + "learning_rate": 3.994352016558341e-06, + "loss": 0.85250491, + "num_input_tokens_seen": 9644855, + "step": 443, + "time_per_iteration": 2.7222635746002197 + }, + { + "auxiliary_loss_clip": 0.01228648, + "auxiliary_loss_mlp": 0.01100446, + "balance_loss_clip": 1.07023621, + "balance_loss_mlp": 1.01318526, + "epoch": 0.05338784344375639, + "flos": 27819960831360.0, + "grad_norm": 4.274657780570931, + "language_loss": 0.73849547, + "learning_rate": 3.994293364526014e-06, + "loss": 0.7617864, + "num_input_tokens_seen": 9665740, + "step": 444, + "time_per_iteration": 2.855788230895996 + }, + { + "auxiliary_loss_clip": 0.01219996, + "auxiliary_loss_mlp": 0.01100458, + "balance_loss_clip": 1.06980526, + "balance_loss_mlp": 1.0131017, + "epoch": 0.05350808633439548, + "flos": 21507691144320.0, + "grad_norm": 2.176864128511704, + "language_loss": 0.8511219, + "learning_rate": 3.99423440996097e-06, + "loss": 0.87432647, + "num_input_tokens_seen": 9685280, + "step": 445, + "time_per_iteration": 2.848830461502075 + }, + { + "auxiliary_loss_clip": 0.01212446, + "auxiliary_loss_mlp": 0.01103469, + "balance_loss_clip": 1.06375778, + "balance_loss_mlp": 1.01625514, + "epoch": 0.05362832922503457, + "flos": 20084299920000.0, + "grad_norm": 2.8557295865358516, + "language_loss": 0.81726778, + "learning_rate": 3.994175152872152e-06, + "loss": 0.84042692, + "num_input_tokens_seen": 9704365, + "step": 446, + "time_per_iteration": 2.848353385925293 + }, + { + "auxiliary_loss_clip": 0.01230327, + "auxiliary_loss_mlp": 0.01101945, + "balance_loss_clip": 1.06931269, + "balance_loss_mlp": 1.01449323, + "epoch": 0.05374857211567366, + "flos": 26137222433280.0, + "grad_norm": 1.8673674114088785, + "language_loss": 0.7860356, + "learning_rate": 3.994115593268548e-06, + "loss": 0.8093583, + "num_input_tokens_seen": 9724145, + "step": 447, + "time_per_iteration": 2.7467548847198486 + }, + { + "auxiliary_loss_clip": 0.01241968, + "auxiliary_loss_mlp": 0.01099634, + "balance_loss_clip": 1.07267618, + "balance_loss_mlp": 1.01246822, + "epoch": 0.05386881500631275, + "flos": 27486817165440.0, + "grad_norm": 2.2030083176166713, + "language_loss": 0.82304913, + "learning_rate": 3.994055731159195e-06, + "loss": 0.84646523, + "num_input_tokens_seen": 9741615, + "step": 448, + "time_per_iteration": 2.755110025405884 + }, + { + "auxiliary_loss_clip": 0.01232519, + "auxiliary_loss_mlp": 0.01103009, + "balance_loss_clip": 1.07201457, + "balance_loss_mlp": 1.01574731, + "epoch": 0.053989057896951846, + "flos": 23585087249280.0, + "grad_norm": 1.8154340611576014, + "language_loss": 0.86797053, + "learning_rate": 3.993995566553172e-06, + "loss": 0.89132577, + "num_input_tokens_seen": 9760580, + "step": 449, + "time_per_iteration": 3.668578863143921 + }, + { + "auxiliary_loss_clip": 0.01208549, + "auxiliary_loss_mlp": 0.01101562, + "balance_loss_clip": 1.06511045, + "balance_loss_mlp": 1.01391912, + "epoch": 0.054109300787590936, + "flos": 25228862369280.0, + "grad_norm": 1.5413670298461752, + "language_loss": 0.77075452, + "learning_rate": 3.993935099459607e-06, + "loss": 0.79385561, + "num_input_tokens_seen": 9782195, + "step": 450, + "time_per_iteration": 3.640817642211914 + }, + { + "auxiliary_loss_clip": 0.01237199, + "auxiliary_loss_mlp": 0.0110282, + "balance_loss_clip": 1.06861186, + "balance_loss_mlp": 1.01584542, + "epoch": 0.054229543678230026, + "flos": 23841525421440.0, + "grad_norm": 1.9267991590360707, + "language_loss": 0.74140084, + "learning_rate": 3.993874329887673e-06, + "loss": 0.76480103, + "num_input_tokens_seen": 9800850, + "step": 451, + "time_per_iteration": 3.6466689109802246 + }, + { + "auxiliary_loss_clip": 0.01229865, + "auxiliary_loss_mlp": 0.01103131, + "balance_loss_clip": 1.0714581, + "balance_loss_mlp": 1.01577401, + "epoch": 0.054349786568869116, + "flos": 16320933192960.0, + "grad_norm": 2.6431535079887953, + "language_loss": 0.86460578, + "learning_rate": 3.993813257846589e-06, + "loss": 0.8879357, + "num_input_tokens_seen": 9817605, + "step": 452, + "time_per_iteration": 3.6469733715057373 + }, + { + "auxiliary_loss_clip": 0.01229926, + "auxiliary_loss_mlp": 0.01103779, + "balance_loss_clip": 1.07042956, + "balance_loss_mlp": 1.01604116, + "epoch": 0.054470029459508205, + "flos": 18660729127680.0, + "grad_norm": 2.770176210225419, + "language_loss": 0.92776579, + "learning_rate": 3.993751883345619e-06, + "loss": 0.95110285, + "num_input_tokens_seen": 9835965, + "step": 453, + "time_per_iteration": 2.7469980716705322 + }, + { + "auxiliary_loss_clip": 0.01209651, + "auxiliary_loss_mlp": 0.0110531, + "balance_loss_clip": 1.06190121, + "balance_loss_mlp": 1.0174768, + "epoch": 0.054590272350147295, + "flos": 17785298856960.0, + "grad_norm": 2.992581984888631, + "language_loss": 0.87722248, + "learning_rate": 3.993690206394073e-06, + "loss": 0.90037209, + "num_input_tokens_seen": 9852265, + "step": 454, + "time_per_iteration": 2.772250175476074 + }, + { + "auxiliary_loss_clip": 0.01214145, + "auxiliary_loss_mlp": 0.01099019, + "balance_loss_clip": 1.06459403, + "balance_loss_mlp": 1.01194906, + "epoch": 0.054710515240786385, + "flos": 17785945301760.0, + "grad_norm": 3.042546435885734, + "language_loss": 0.87658834, + "learning_rate": 3.993628227001307e-06, + "loss": 0.89972001, + "num_input_tokens_seen": 9870465, + "step": 455, + "time_per_iteration": 2.8018791675567627 + }, + { + "auxiliary_loss_clip": 0.01214623, + "auxiliary_loss_mlp": 0.01102691, + "balance_loss_clip": 1.06664252, + "balance_loss_mlp": 1.01514387, + "epoch": 0.05483075813142548, + "flos": 48210900180480.0, + "grad_norm": 1.9820659242559413, + "language_loss": 0.71436298, + "learning_rate": 3.993565945176726e-06, + "loss": 0.73753607, + "num_input_tokens_seen": 9891490, + "step": 456, + "time_per_iteration": 2.995931386947632 + }, + { + "auxiliary_loss_clip": 0.01201475, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_clip": 1.0553875, + "balance_loss_mlp": 1.01331079, + "epoch": 0.05495100102206457, + "flos": 19682244011520.0, + "grad_norm": 1.9930005676470222, + "language_loss": 0.84221381, + "learning_rate": 3.993503360929776e-06, + "loss": 0.86522806, + "num_input_tokens_seen": 9910375, + "step": 457, + "time_per_iteration": 2.7575886249542236 + }, + { + "auxiliary_loss_clip": 0.01158501, + "auxiliary_loss_mlp": 0.01103879, + "balance_loss_clip": 1.05511487, + "balance_loss_mlp": 1.01652288, + "epoch": 0.05507124391270366, + "flos": 26360048453760.0, + "grad_norm": 1.7191262096653082, + "language_loss": 0.81053966, + "learning_rate": 3.99344047426995e-06, + "loss": 0.8331635, + "num_input_tokens_seen": 9931635, + "step": 458, + "time_per_iteration": 3.205861806869507 + }, + { + "auxiliary_loss_clip": 0.01181546, + "auxiliary_loss_mlp": 0.0110055, + "balance_loss_clip": 1.0555985, + "balance_loss_mlp": 1.01295519, + "epoch": 0.05519148680334275, + "flos": 22601314581120.0, + "grad_norm": 2.8035272461009075, + "language_loss": 0.93720257, + "learning_rate": 3.993377285206789e-06, + "loss": 0.96002352, + "num_input_tokens_seen": 9951420, + "step": 459, + "time_per_iteration": 3.2570269107818604 + }, + { + "auxiliary_loss_clip": 0.01193865, + "auxiliary_loss_mlp": 0.01102801, + "balance_loss_clip": 1.06199861, + "balance_loss_mlp": 1.01534915, + "epoch": 0.05531172969398184, + "flos": 40552519380480.0, + "grad_norm": 1.6656404275540257, + "language_loss": 0.86422539, + "learning_rate": 3.99331379374988e-06, + "loss": 0.88719201, + "num_input_tokens_seen": 9975025, + "step": 460, + "time_per_iteration": 2.998298406600952 + }, + { + "auxiliary_loss_clip": 0.01220112, + "auxiliary_loss_mlp": 0.01098205, + "balance_loss_clip": 1.06841981, + "balance_loss_mlp": 1.01127791, + "epoch": 0.05543197258462093, + "flos": 23477894087040.0, + "grad_norm": 1.9034595715873617, + "language_loss": 0.79958153, + "learning_rate": 3.993249999908852e-06, + "loss": 0.82276469, + "num_input_tokens_seen": 9995175, + "step": 461, + "time_per_iteration": 2.804617166519165 + }, + { + "auxiliary_loss_clip": 0.01237359, + "auxiliary_loss_mlp": 0.01100211, + "balance_loss_clip": 1.06854367, + "balance_loss_mlp": 1.01295018, + "epoch": 0.05555221547526003, + "flos": 18624603024000.0, + "grad_norm": 2.8659886661620924, + "language_loss": 0.87038732, + "learning_rate": 3.993185903693384e-06, + "loss": 0.89376307, + "num_input_tokens_seen": 10011975, + "step": 462, + "time_per_iteration": 2.7012832164764404 + }, + { + "auxiliary_loss_clip": 0.01210792, + "auxiliary_loss_mlp": 0.0110351, + "balance_loss_clip": 1.06424737, + "balance_loss_mlp": 1.01629674, + "epoch": 0.05567245836589912, + "flos": 23587098410880.0, + "grad_norm": 2.1187828868720255, + "language_loss": 0.82334989, + "learning_rate": 3.9931215051131995e-06, + "loss": 0.84649289, + "num_input_tokens_seen": 10032620, + "step": 463, + "time_per_iteration": 2.7994954586029053 + }, + { + "auxiliary_loss_clip": 0.01220606, + "auxiliary_loss_mlp": 0.01100446, + "balance_loss_clip": 1.06933475, + "balance_loss_mlp": 1.01318526, + "epoch": 0.05579270125653821, + "flos": 27746667129600.0, + "grad_norm": 1.5870084075353297, + "language_loss": 0.79993516, + "learning_rate": 3.993056804178068e-06, + "loss": 0.82314563, + "num_input_tokens_seen": 10054165, + "step": 464, + "time_per_iteration": 2.8721659183502197 + }, + { + "auxiliary_loss_clip": 0.01189316, + "auxiliary_loss_mlp": 0.01102786, + "balance_loss_clip": 1.06409192, + "balance_loss_mlp": 1.01557291, + "epoch": 0.0559129441471773, + "flos": 27014161075200.0, + "grad_norm": 2.193390546895112, + "language_loss": 0.84304786, + "learning_rate": 3.992991800897803e-06, + "loss": 0.86596894, + "num_input_tokens_seen": 10073970, + "step": 465, + "time_per_iteration": 2.914649724960327 + }, + { + "auxiliary_loss_clip": 0.0123669, + "auxiliary_loss_mlp": 0.01102169, + "balance_loss_clip": 1.06873965, + "balance_loss_mlp": 1.01519442, + "epoch": 0.05603318703781639, + "flos": 15229787794560.0, + "grad_norm": 2.28304833651097, + "language_loss": 0.9001323, + "learning_rate": 3.9929264952822665e-06, + "loss": 0.92352092, + "num_input_tokens_seen": 10091505, + "step": 466, + "time_per_iteration": 2.8075547218322754 + }, + { + "auxiliary_loss_clip": 0.01226136, + "auxiliary_loss_mlp": 0.01104462, + "balance_loss_clip": 1.06811035, + "balance_loss_mlp": 1.01715279, + "epoch": 0.05615342992845548, + "flos": 22266482976000.0, + "grad_norm": 1.9681163527930314, + "language_loss": 0.88210505, + "learning_rate": 3.992860887341366e-06, + "loss": 0.90541106, + "num_input_tokens_seen": 10109675, + "step": 467, + "time_per_iteration": 2.7408792972564697 + }, + { + "auxiliary_loss_clip": 0.01197371, + "auxiliary_loss_mlp": 0.01099784, + "balance_loss_clip": 1.06643271, + "balance_loss_mlp": 1.01257074, + "epoch": 0.056273672819094574, + "flos": 23584979508480.0, + "grad_norm": 1.8953790157684136, + "language_loss": 0.81084478, + "learning_rate": 3.992794977085052e-06, + "loss": 0.83381629, + "num_input_tokens_seen": 10127675, + "step": 468, + "time_per_iteration": 2.843146324157715 + }, + { + "auxiliary_loss_clip": 0.01197268, + "auxiliary_loss_mlp": 0.01101715, + "balance_loss_clip": 1.06001794, + "balance_loss_mlp": 1.01459718, + "epoch": 0.056393915709733664, + "flos": 19858708552320.0, + "grad_norm": 2.473140152248595, + "language_loss": 0.85018545, + "learning_rate": 3.992728764523326e-06, + "loss": 0.87317532, + "num_input_tokens_seen": 10146620, + "step": 469, + "time_per_iteration": 2.909348964691162 + }, + { + "auxiliary_loss_clip": 0.01213539, + "auxiliary_loss_mlp": 0.01102422, + "balance_loss_clip": 1.06620884, + "balance_loss_mlp": 1.01530397, + "epoch": 0.05651415860037275, + "flos": 22163779013760.0, + "grad_norm": 1.685551153909567, + "language_loss": 0.80861235, + "learning_rate": 3.99266224966623e-06, + "loss": 0.83177197, + "num_input_tokens_seen": 10167535, + "step": 470, + "time_per_iteration": 2.846602201461792 + }, + { + "auxiliary_loss_clip": 0.01205936, + "auxiliary_loss_mlp": 0.01103074, + "balance_loss_clip": 1.06164086, + "balance_loss_mlp": 1.016433, + "epoch": 0.05663440149101184, + "flos": 19463548055040.0, + "grad_norm": 10.49299538916764, + "language_loss": 0.87964797, + "learning_rate": 3.992595432523855e-06, + "loss": 0.90273809, + "num_input_tokens_seen": 10184825, + "step": 471, + "time_per_iteration": 2.793323516845703 + }, + { + "auxiliary_loss_clip": 0.01191262, + "auxiliary_loss_mlp": 0.01098416, + "balance_loss_clip": 1.05805302, + "balance_loss_mlp": 1.01187038, + "epoch": 0.05675464438165093, + "flos": 22670226823680.0, + "grad_norm": 1.8881505693556133, + "language_loss": 0.86200595, + "learning_rate": 3.992528313106338e-06, + "loss": 0.88490272, + "num_input_tokens_seen": 10203025, + "step": 472, + "time_per_iteration": 2.871325731277466 + }, + { + "auxiliary_loss_clip": 0.0123888, + "auxiliary_loss_mlp": 0.00876743, + "balance_loss_clip": 1.07191682, + "balance_loss_mlp": 1.00100815, + "epoch": 0.05687488727229002, + "flos": 16901177495040.0, + "grad_norm": 2.72422156289922, + "language_loss": 0.82212734, + "learning_rate": 3.9924608914238595e-06, + "loss": 0.84328353, + "num_input_tokens_seen": 10218020, + "step": 473, + "time_per_iteration": 2.8481760025024414 + }, + { + "auxiliary_loss_clip": 0.01222829, + "auxiliary_loss_mlp": 0.01097495, + "balance_loss_clip": 1.06875443, + "balance_loss_mlp": 1.01066327, + "epoch": 0.05699513016292912, + "flos": 29168980945920.0, + "grad_norm": 2.1877252322812364, + "language_loss": 0.83896124, + "learning_rate": 3.992393167486648e-06, + "loss": 0.8621645, + "num_input_tokens_seen": 10237170, + "step": 474, + "time_per_iteration": 2.8942067623138428 + }, + { + "auxiliary_loss_clip": 0.0123773, + "auxiliary_loss_mlp": 0.01099663, + "balance_loss_clip": 1.07081044, + "balance_loss_mlp": 1.01249695, + "epoch": 0.05711537305356821, + "flos": 18916197632640.0, + "grad_norm": 3.0271102812379533, + "language_loss": 0.80752337, + "learning_rate": 3.992325141304977e-06, + "loss": 0.83089727, + "num_input_tokens_seen": 10255125, + "step": 475, + "time_per_iteration": 4.616311073303223 + }, + { + "auxiliary_loss_clip": 0.01196739, + "auxiliary_loss_mlp": 0.01099827, + "balance_loss_clip": 1.05847883, + "balance_loss_mlp": 1.01294684, + "epoch": 0.0572356159442073, + "flos": 26758979879040.0, + "grad_norm": 2.288131265605499, + "language_loss": 0.86865127, + "learning_rate": 3.992256812889166e-06, + "loss": 0.89161688, + "num_input_tokens_seen": 10271230, + "step": 476, + "time_per_iteration": 4.032567262649536 + }, + { + "auxiliary_loss_clip": 0.01239848, + "auxiliary_loss_mlp": 0.0110043, + "balance_loss_clip": 1.07303429, + "balance_loss_mlp": 1.0137887, + "epoch": 0.05735585883484639, + "flos": 35116146840960.0, + "grad_norm": 2.24252680362147, + "language_loss": 0.76573038, + "learning_rate": 3.992188182249582e-06, + "loss": 0.78913313, + "num_input_tokens_seen": 10293125, + "step": 477, + "time_per_iteration": 3.003391742706299 + }, + { + "auxiliary_loss_clip": 0.01203421, + "auxiliary_loss_mlp": 0.01103026, + "balance_loss_clip": 1.06109762, + "balance_loss_mlp": 1.01595545, + "epoch": 0.05747610172548548, + "flos": 18734381965440.0, + "grad_norm": 2.0844828780455336, + "language_loss": 0.90398729, + "learning_rate": 3.992119249396633e-06, + "loss": 0.92705178, + "num_input_tokens_seen": 10311810, + "step": 478, + "time_per_iteration": 3.8250279426574707 + }, + { + "auxiliary_loss_clip": 0.01208697, + "auxiliary_loss_mlp": 0.0087664, + "balance_loss_clip": 1.06124091, + "balance_loss_mlp": 1.00094891, + "epoch": 0.05759634461612457, + "flos": 27964752554880.0, + "grad_norm": 1.8534103403488011, + "language_loss": 0.82255781, + "learning_rate": 3.992050014340778e-06, + "loss": 0.84341121, + "num_input_tokens_seen": 10332165, + "step": 479, + "time_per_iteration": 2.974686622619629 + }, + { + "auxiliary_loss_clip": 0.01206663, + "auxiliary_loss_mlp": 0.01081813, + "balance_loss_clip": 1.06216097, + "balance_loss_mlp": 1.00017834, + "epoch": 0.057716587506763666, + "flos": 69292009405440.0, + "grad_norm": 0.8454397020991076, + "language_loss": 0.55052888, + "learning_rate": 3.99198047709252e-06, + "loss": 0.57341361, + "num_input_tokens_seen": 10393685, + "step": 480, + "time_per_iteration": 3.525174617767334 + }, + { + "auxiliary_loss_clip": 0.01195531, + "auxiliary_loss_mlp": 0.01097724, + "balance_loss_clip": 1.06151366, + "balance_loss_mlp": 1.01074886, + "epoch": 0.057836830397402755, + "flos": 25009196745600.0, + "grad_norm": 1.7381876804897256, + "language_loss": 0.78799522, + "learning_rate": 3.991910637662408e-06, + "loss": 0.81092781, + "num_input_tokens_seen": 10413975, + "step": 481, + "time_per_iteration": 3.0296356678009033 + }, + { + "auxiliary_loss_clip": 0.01235035, + "auxiliary_loss_mlp": 0.01098143, + "balance_loss_clip": 1.06883311, + "balance_loss_mlp": 1.01169229, + "epoch": 0.057957073288041845, + "flos": 25593894334080.0, + "grad_norm": 3.012103133012173, + "language_loss": 0.80835408, + "learning_rate": 3.9918404960610355e-06, + "loss": 0.83168584, + "num_input_tokens_seen": 10433005, + "step": 482, + "time_per_iteration": 2.850440740585327 + }, + { + "auxiliary_loss_clip": 0.01227013, + "auxiliary_loss_mlp": 0.01105747, + "balance_loss_clip": 1.06854904, + "balance_loss_mlp": 1.01867676, + "epoch": 0.058077316178680935, + "flos": 20777411733120.0, + "grad_norm": 2.166962364174723, + "language_loss": 0.7754426, + "learning_rate": 3.991770052299043e-06, + "loss": 0.79877019, + "num_input_tokens_seen": 10451235, + "step": 483, + "time_per_iteration": 2.766313314437866 + }, + { + "auxiliary_loss_clip": 0.01216235, + "auxiliary_loss_mlp": 0.01101776, + "balance_loss_clip": 1.06702483, + "balance_loss_mlp": 1.01499116, + "epoch": 0.058197559069320025, + "flos": 18916484941440.0, + "grad_norm": 2.215454341834186, + "language_loss": 0.87421906, + "learning_rate": 3.991699306387118e-06, + "loss": 0.89739919, + "num_input_tokens_seen": 10469705, + "step": 484, + "time_per_iteration": 2.836282253265381 + }, + { + "auxiliary_loss_clip": 0.01228002, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_clip": 1.06941545, + "balance_loss_mlp": 1.01577401, + "epoch": 0.058317801959959115, + "flos": 24863327614080.0, + "grad_norm": 1.8420328757963091, + "language_loss": 0.78100759, + "learning_rate": 3.991628258335991e-06, + "loss": 0.80431175, + "num_input_tokens_seen": 10491910, + "step": 485, + "time_per_iteration": 2.8096108436584473 + }, + { + "auxiliary_loss_clip": 0.01193631, + "auxiliary_loss_mlp": 0.01097326, + "balance_loss_clip": 1.05757761, + "balance_loss_mlp": 1.01106596, + "epoch": 0.05843804485059821, + "flos": 23257977068160.0, + "grad_norm": 3.583109557466314, + "language_loss": 0.88050634, + "learning_rate": 3.991556908156442e-06, + "loss": 0.90341592, + "num_input_tokens_seen": 10508435, + "step": 486, + "time_per_iteration": 2.8398098945617676 + }, + { + "auxiliary_loss_clip": 0.01215059, + "auxiliary_loss_mlp": 0.01099377, + "balance_loss_clip": 1.06605721, + "balance_loss_mlp": 1.01244974, + "epoch": 0.0585582877412373, + "flos": 23150532510720.0, + "grad_norm": 2.2951092301104112, + "language_loss": 0.87929201, + "learning_rate": 3.9914852558592914e-06, + "loss": 0.90243638, + "num_input_tokens_seen": 10529485, + "step": 487, + "time_per_iteration": 2.9044716358184814 + }, + { + "auxiliary_loss_clip": 0.01223192, + "auxiliary_loss_mlp": 0.01099854, + "balance_loss_clip": 1.06812572, + "balance_loss_mlp": 1.01278341, + "epoch": 0.05867853063187639, + "flos": 23506406507520.0, + "grad_norm": 2.791012521941033, + "language_loss": 0.80551344, + "learning_rate": 3.991413301455413e-06, + "loss": 0.82874393, + "num_input_tokens_seen": 10545935, + "step": 488, + "time_per_iteration": 2.791477918624878 + }, + { + "auxiliary_loss_clip": 0.01209984, + "auxiliary_loss_mlp": 0.01096795, + "balance_loss_clip": 1.06177807, + "balance_loss_mlp": 1.01043963, + "epoch": 0.05879877352251548, + "flos": 29495803818240.0, + "grad_norm": 3.5921150414293486, + "language_loss": 0.77630055, + "learning_rate": 3.991341044955719e-06, + "loss": 0.79936838, + "num_input_tokens_seen": 10565690, + "step": 489, + "time_per_iteration": 2.8334317207336426 + }, + { + "auxiliary_loss_clip": 0.01223019, + "auxiliary_loss_mlp": 0.00876646, + "balance_loss_clip": 1.06577206, + "balance_loss_mlp": 1.00084269, + "epoch": 0.05891901641315457, + "flos": 20157485880960.0, + "grad_norm": 2.1314656366540143, + "language_loss": 0.81495708, + "learning_rate": 3.991268486371172e-06, + "loss": 0.83595377, + "num_input_tokens_seen": 10584245, + "step": 490, + "time_per_iteration": 2.7596964836120605 + }, + { + "auxiliary_loss_clip": 0.0121371, + "auxiliary_loss_mlp": 0.01100152, + "balance_loss_clip": 1.06679893, + "balance_loss_mlp": 1.01346338, + "epoch": 0.05903925930379366, + "flos": 24644200694400.0, + "grad_norm": 2.750648366196763, + "language_loss": 0.88038921, + "learning_rate": 3.991195625712779e-06, + "loss": 0.90352792, + "num_input_tokens_seen": 10601210, + "step": 491, + "time_per_iteration": 2.7909905910491943 + }, + { + "auxiliary_loss_clip": 0.01234989, + "auxiliary_loss_mlp": 0.01096871, + "balance_loss_clip": 1.06918263, + "balance_loss_mlp": 1.01051617, + "epoch": 0.05915950219443276, + "flos": 21250391045760.0, + "grad_norm": 2.241100632475172, + "language_loss": 0.8167575, + "learning_rate": 3.991122462991592e-06, + "loss": 0.84007609, + "num_input_tokens_seen": 10620730, + "step": 492, + "time_per_iteration": 2.7367031574249268 + }, + { + "auxiliary_loss_clip": 0.01236532, + "auxiliary_loss_mlp": 0.01101453, + "balance_loss_clip": 1.06992364, + "balance_loss_mlp": 1.01447797, + "epoch": 0.05927974508507185, + "flos": 9902727319680.0, + "grad_norm": 3.2586443857987937, + "language_loss": 0.81111449, + "learning_rate": 3.991048998218712e-06, + "loss": 0.83449435, + "num_input_tokens_seen": 10634035, + "step": 493, + "time_per_iteration": 2.731767177581787 + }, + { + "auxiliary_loss_clip": 0.01226602, + "auxiliary_loss_mlp": 0.01100257, + "balance_loss_clip": 1.06958485, + "balance_loss_mlp": 1.01347244, + "epoch": 0.05939998797571094, + "flos": 18259499232000.0, + "grad_norm": 2.160111805297912, + "language_loss": 0.76765919, + "learning_rate": 3.990975231405281e-06, + "loss": 0.79092777, + "num_input_tokens_seen": 10652485, + "step": 494, + "time_per_iteration": 2.7386748790740967 + }, + { + "auxiliary_loss_clip": 0.01218858, + "auxiliary_loss_mlp": 0.01103029, + "balance_loss_clip": 1.06934011, + "balance_loss_mlp": 1.01662588, + "epoch": 0.05952023086635003, + "flos": 28256598558720.0, + "grad_norm": 1.9911596545754562, + "language_loss": 0.78725207, + "learning_rate": 3.990901162562491e-06, + "loss": 0.81047094, + "num_input_tokens_seen": 10673175, + "step": 495, + "time_per_iteration": 2.743014335632324 + }, + { + "auxiliary_loss_clip": 0.01193381, + "auxiliary_loss_mlp": 0.0087659, + "balance_loss_clip": 1.06252837, + "balance_loss_mlp": 1.0008204, + "epoch": 0.05964047375698912, + "flos": 14902498045440.0, + "grad_norm": 1.8819576555840911, + "language_loss": 0.90490669, + "learning_rate": 3.9908267917015765e-06, + "loss": 0.92560643, + "num_input_tokens_seen": 10691235, + "step": 496, + "time_per_iteration": 2.8337786197662354 + }, + { + "auxiliary_loss_clip": 0.01219949, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_clip": 1.06299269, + "balance_loss_mlp": 1.01532006, + "epoch": 0.059760716647628206, + "flos": 23185581206400.0, + "grad_norm": 2.0923948554392795, + "language_loss": 0.92833501, + "learning_rate": 3.990752118833821e-06, + "loss": 0.95155847, + "num_input_tokens_seen": 10708675, + "step": 497, + "time_per_iteration": 2.681034803390503 + }, + { + "auxiliary_loss_clip": 0.01237141, + "auxiliary_loss_mlp": 0.01096269, + "balance_loss_clip": 1.07148576, + "balance_loss_mlp": 1.00996101, + "epoch": 0.0598809595382673, + "flos": 22746968231040.0, + "grad_norm": 4.322501747182554, + "language_loss": 0.78263468, + "learning_rate": 3.990677143970553e-06, + "loss": 0.80596876, + "num_input_tokens_seen": 10729485, + "step": 498, + "time_per_iteration": 2.8218631744384766 + }, + { + "auxiliary_loss_clip": 0.01193385, + "auxiliary_loss_mlp": 0.0109897, + "balance_loss_clip": 1.06068778, + "balance_loss_mlp": 1.01256752, + "epoch": 0.06000120242890639, + "flos": 22127221946880.0, + "grad_norm": 5.1098836250588935, + "language_loss": 0.80970395, + "learning_rate": 3.990601867123144e-06, + "loss": 0.83262753, + "num_input_tokens_seen": 10749210, + "step": 499, + "time_per_iteration": 2.830597162246704 + }, + { + "auxiliary_loss_clip": 0.01181582, + "auxiliary_loss_mlp": 0.0109703, + "balance_loss_clip": 1.05597198, + "balance_loss_mlp": 1.0107224, + "epoch": 0.06012144531954548, + "flos": 19171773878400.0, + "grad_norm": 2.1413577471992866, + "language_loss": 0.84996104, + "learning_rate": 3.990526288303014e-06, + "loss": 0.87274712, + "num_input_tokens_seen": 10768000, + "step": 500, + "time_per_iteration": 4.619725227355957 + }, + { + "auxiliary_loss_clip": 0.0121068, + "auxiliary_loss_mlp": 0.00876477, + "balance_loss_clip": 1.06738853, + "balance_loss_mlp": 1.00077105, + "epoch": 0.06024168821018457, + "flos": 22783345729920.0, + "grad_norm": 1.7816954707167592, + "language_loss": 0.90783751, + "learning_rate": 3.9904504075216295e-06, + "loss": 0.92870903, + "num_input_tokens_seen": 10788760, + "step": 501, + "time_per_iteration": 3.7775988578796387 + }, + { + "auxiliary_loss_clip": 0.01202477, + "auxiliary_loss_mlp": 0.01101903, + "balance_loss_clip": 1.06403828, + "balance_loss_mlp": 1.01507139, + "epoch": 0.06036193110082366, + "flos": 18770687637120.0, + "grad_norm": 2.3294992283280065, + "language_loss": 0.93635035, + "learning_rate": 3.990374224790501e-06, + "loss": 0.95939422, + "num_input_tokens_seen": 10806965, + "step": 502, + "time_per_iteration": 3.750256299972534 + }, + { + "auxiliary_loss_clip": 0.01212955, + "auxiliary_loss_mlp": 0.01099038, + "balance_loss_clip": 1.06596816, + "balance_loss_mlp": 1.01258802, + "epoch": 0.06048217399146275, + "flos": 17201570935680.0, + "grad_norm": 2.1696975375958423, + "language_loss": 0.7088142, + "learning_rate": 3.990297740121185e-06, + "loss": 0.73193419, + "num_input_tokens_seen": 10824900, + "step": 503, + "time_per_iteration": 2.798902750015259 + }, + { + "auxiliary_loss_clip": 0.01218573, + "auxiliary_loss_mlp": 0.00876474, + "balance_loss_clip": 1.06367898, + "balance_loss_mlp": 1.00073791, + "epoch": 0.06060241688210185, + "flos": 24024131187840.0, + "grad_norm": 2.0170974729596876, + "language_loss": 0.7815001, + "learning_rate": 3.990220953525284e-06, + "loss": 0.80245054, + "num_input_tokens_seen": 10842010, + "step": 504, + "time_per_iteration": 2.7258706092834473 + }, + { + "auxiliary_loss_clip": 0.01208476, + "auxiliary_loss_mlp": 0.01100722, + "balance_loss_clip": 1.06238115, + "balance_loss_mlp": 1.01398551, + "epoch": 0.06072265977274094, + "flos": 14611190745600.0, + "grad_norm": 2.8339078257442574, + "language_loss": 0.74141318, + "learning_rate": 3.9901438650144465e-06, + "loss": 0.76450515, + "num_input_tokens_seen": 10858260, + "step": 505, + "time_per_iteration": 2.7310636043548584 + }, + { + "auxiliary_loss_clip": 0.01222327, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_clip": 1.06544471, + "balance_loss_mlp": 1.01103365, + "epoch": 0.06084290266338003, + "flos": 20558284813440.0, + "grad_norm": 2.3559748930105124, + "language_loss": 0.91689384, + "learning_rate": 3.990066474600367e-06, + "loss": 0.94009101, + "num_input_tokens_seen": 10876230, + "step": 506, + "time_per_iteration": 2.6634082794189453 + }, + { + "auxiliary_loss_clip": 0.01222483, + "auxiliary_loss_mlp": 0.01099758, + "balance_loss_clip": 1.06659269, + "balance_loss_mlp": 1.01325941, + "epoch": 0.06096314555401912, + "flos": 22309217182080.0, + "grad_norm": 1.8372003234621745, + "language_loss": 0.67928231, + "learning_rate": 3.989988782294786e-06, + "loss": 0.70250463, + "num_input_tokens_seen": 10896320, + "step": 507, + "time_per_iteration": 2.7944681644439697 + }, + { + "auxiliary_loss_clip": 0.01197089, + "auxiliary_loss_mlp": 0.01102982, + "balance_loss_clip": 1.05985093, + "balance_loss_mlp": 1.0164361, + "epoch": 0.06108338844465821, + "flos": 19131374056320.0, + "grad_norm": 1.7483729526275968, + "language_loss": 0.94985074, + "learning_rate": 3.989910788109489e-06, + "loss": 0.97285146, + "num_input_tokens_seen": 10912970, + "step": 508, + "time_per_iteration": 2.8101582527160645 + }, + { + "auxiliary_loss_clip": 0.01193919, + "auxiliary_loss_mlp": 0.01099241, + "balance_loss_clip": 1.06240511, + "balance_loss_mlp": 1.01250422, + "epoch": 0.0612036313352973, + "flos": 33584018169600.0, + "grad_norm": 2.2755431209960113, + "language_loss": 0.74815702, + "learning_rate": 3.989832492056307e-06, + "loss": 0.7710886, + "num_input_tokens_seen": 10933995, + "step": 509, + "time_per_iteration": 2.955742120742798 + }, + { + "auxiliary_loss_clip": 0.0122385, + "auxiliary_loss_mlp": 0.01098969, + "balance_loss_clip": 1.069121, + "balance_loss_mlp": 1.01256609, + "epoch": 0.06132387422593639, + "flos": 27490552179840.0, + "grad_norm": 2.611672361968857, + "language_loss": 0.80876493, + "learning_rate": 3.989753894147119e-06, + "loss": 0.8319931, + "num_input_tokens_seen": 10954120, + "step": 510, + "time_per_iteration": 2.773916482925415 + }, + { + "auxiliary_loss_clip": 0.01222655, + "auxiliary_loss_mlp": 0.01099054, + "balance_loss_clip": 1.06916785, + "balance_loss_mlp": 1.01346135, + "epoch": 0.061444117116575485, + "flos": 25885057979520.0, + "grad_norm": 2.4304752705040924, + "language_loss": 0.79903388, + "learning_rate": 3.989674994393846e-06, + "loss": 0.82225102, + "num_input_tokens_seen": 10973595, + "step": 511, + "time_per_iteration": 2.75158429145813 + }, + { + "auxiliary_loss_clip": 0.01223368, + "auxiliary_loss_mlp": 0.01100781, + "balance_loss_clip": 1.06902385, + "balance_loss_mlp": 1.01480782, + "epoch": 0.061564360007214575, + "flos": 28512031150080.0, + "grad_norm": 1.9583231373973604, + "language_loss": 0.93967056, + "learning_rate": 3.98959579280846e-06, + "loss": 0.96291208, + "num_input_tokens_seen": 10991995, + "step": 512, + "time_per_iteration": 2.7993125915527344 + }, + { + "auxiliary_loss_clip": 0.01173886, + "auxiliary_loss_mlp": 0.01098801, + "balance_loss_clip": 1.06061876, + "balance_loss_mlp": 1.01292324, + "epoch": 0.061684602897853665, + "flos": 12094355652480.0, + "grad_norm": 1.951059488439664, + "language_loss": 0.82753825, + "learning_rate": 3.989516289402973e-06, + "loss": 0.85026515, + "num_input_tokens_seen": 11007625, + "step": 513, + "time_per_iteration": 2.967564105987549 + }, + { + "auxiliary_loss_clip": 0.01172007, + "auxiliary_loss_mlp": 0.01099597, + "balance_loss_clip": 1.06052566, + "balance_loss_mlp": 1.01348007, + "epoch": 0.061804845788492754, + "flos": 19532639865600.0, + "grad_norm": 4.020277672657811, + "language_loss": 0.79988742, + "learning_rate": 3.989436484189447e-06, + "loss": 0.82260352, + "num_input_tokens_seen": 11025570, + "step": 514, + "time_per_iteration": 2.9876182079315186 + }, + { + "auxiliary_loss_clip": 0.01223074, + "auxiliary_loss_mlp": 0.01099347, + "balance_loss_clip": 1.0674715, + "balance_loss_mlp": 1.01280117, + "epoch": 0.061925088679131844, + "flos": 15341111020800.0, + "grad_norm": 2.290080251154748, + "language_loss": 0.80008048, + "learning_rate": 3.9893563771799885e-06, + "loss": 0.82330477, + "num_input_tokens_seen": 11042045, + "step": 515, + "time_per_iteration": 2.7245237827301025 + }, + { + "auxiliary_loss_clip": 0.01232257, + "auxiliary_loss_mlp": 0.01102209, + "balance_loss_clip": 1.06809926, + "balance_loss_mlp": 1.01542449, + "epoch": 0.062045331569770934, + "flos": 25919927107200.0, + "grad_norm": 2.0471322873136693, + "language_loss": 0.86196125, + "learning_rate": 3.989275968386749e-06, + "loss": 0.88530588, + "num_input_tokens_seen": 11059955, + "step": 516, + "time_per_iteration": 2.793382167816162 + }, + { + "auxiliary_loss_clip": 0.01211462, + "auxiliary_loss_mlp": 0.01099274, + "balance_loss_clip": 1.06467056, + "balance_loss_mlp": 1.01306224, + "epoch": 0.06216557446041003, + "flos": 28110621686400.0, + "grad_norm": 2.034717299720939, + "language_loss": 0.76738834, + "learning_rate": 3.989195257821926e-06, + "loss": 0.79049575, + "num_input_tokens_seen": 11078440, + "step": 517, + "time_per_iteration": 2.8510775566101074 + }, + { + "auxiliary_loss_clip": 0.01204016, + "auxiliary_loss_mlp": 0.01101695, + "balance_loss_clip": 1.06407285, + "balance_loss_mlp": 1.01486325, + "epoch": 0.06228581735104912, + "flos": 23478181395840.0, + "grad_norm": 2.111828476149962, + "language_loss": 0.8432464, + "learning_rate": 3.989114245497765e-06, + "loss": 0.86630356, + "num_input_tokens_seen": 11098240, + "step": 518, + "time_per_iteration": 2.878568410873413 + }, + { + "auxiliary_loss_clip": 0.01220584, + "auxiliary_loss_mlp": 0.01096127, + "balance_loss_clip": 1.06447935, + "balance_loss_mlp": 1.01001048, + "epoch": 0.06240606024168821, + "flos": 15195205975680.0, + "grad_norm": 2.8827027899878597, + "language_loss": 0.94734716, + "learning_rate": 3.989032931426554e-06, + "loss": 0.9705143, + "num_input_tokens_seen": 11115395, + "step": 519, + "time_per_iteration": 2.817486047744751 + }, + { + "auxiliary_loss_clip": 0.01196165, + "auxiliary_loss_mlp": 0.0110074, + "balance_loss_clip": 1.05568123, + "balance_loss_mlp": 1.01462293, + "epoch": 0.06252630313232731, + "flos": 20631829910400.0, + "grad_norm": 2.164808736534649, + "language_loss": 0.86655617, + "learning_rate": 3.9889513156206295e-06, + "loss": 0.88952518, + "num_input_tokens_seen": 11134835, + "step": 520, + "time_per_iteration": 2.75494647026062 + }, + { + "auxiliary_loss_clip": 0.01196214, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_clip": 1.06195569, + "balance_loss_mlp": 1.01108181, + "epoch": 0.06264654602296639, + "flos": 20778058177920.0, + "grad_norm": 3.077770615165173, + "language_loss": 0.73869848, + "learning_rate": 3.988869398092371e-06, + "loss": 0.76163411, + "num_input_tokens_seen": 11154745, + "step": 521, + "time_per_iteration": 2.915292978286743 + }, + { + "auxiliary_loss_clip": 0.01208037, + "auxiliary_loss_mlp": 0.0109695, + "balance_loss_clip": 1.06435668, + "balance_loss_mlp": 1.01059473, + "epoch": 0.06276678891360549, + "flos": 29605798241280.0, + "grad_norm": 2.744848681418098, + "language_loss": 0.78829658, + "learning_rate": 3.988787178854206e-06, + "loss": 0.81134641, + "num_input_tokens_seen": 11174280, + "step": 522, + "time_per_iteration": 2.7935280799865723 + }, + { + "auxiliary_loss_clip": 0.01233898, + "auxiliary_loss_mlp": 0.01098422, + "balance_loss_clip": 1.06956887, + "balance_loss_mlp": 1.01211488, + "epoch": 0.06288703180424457, + "flos": 22126288193280.0, + "grad_norm": 2.093709767952153, + "language_loss": 0.8731848, + "learning_rate": 3.988704657918608e-06, + "loss": 0.89650798, + "num_input_tokens_seen": 11193340, + "step": 523, + "time_per_iteration": 2.7894999980926514 + }, + { + "auxiliary_loss_clip": 0.01217106, + "auxiliary_loss_mlp": 0.01101351, + "balance_loss_clip": 1.06391776, + "balance_loss_mlp": 1.01556778, + "epoch": 0.06300727469488367, + "flos": 14976689587200.0, + "grad_norm": 2.500697484061014, + "language_loss": 0.79754645, + "learning_rate": 3.988621835298094e-06, + "loss": 0.82073104, + "num_input_tokens_seen": 11210555, + "step": 524, + "time_per_iteration": 2.7180421352386475 + }, + { + "auxiliary_loss_clip": 0.01233993, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_clip": 1.07022643, + "balance_loss_mlp": 1.01330876, + "epoch": 0.06312751758552275, + "flos": 24535391420160.0, + "grad_norm": 2.0193457873222362, + "language_loss": 0.91547346, + "learning_rate": 3.988538711005229e-06, + "loss": 0.93880433, + "num_input_tokens_seen": 11230010, + "step": 525, + "time_per_iteration": 4.576650142669678 + }, + { + "auxiliary_loss_clip": 0.01211644, + "auxiliary_loss_mlp": 0.01097493, + "balance_loss_clip": 1.06054139, + "balance_loss_mlp": 1.0118531, + "epoch": 0.06324776047616185, + "flos": 21507008785920.0, + "grad_norm": 2.1567138232403664, + "language_loss": 0.87943947, + "learning_rate": 3.988455285052622e-06, + "loss": 0.90253091, + "num_input_tokens_seen": 11246190, + "step": 526, + "time_per_iteration": 2.8017988204956055 + }, + { + "auxiliary_loss_clip": 0.01219821, + "auxiliary_loss_mlp": 0.01096737, + "balance_loss_clip": 1.06505668, + "balance_loss_mlp": 1.01104951, + "epoch": 0.06336800336680094, + "flos": 21688034353920.0, + "grad_norm": 1.9357462956253555, + "language_loss": 0.83599329, + "learning_rate": 3.98837155745293e-06, + "loss": 0.85915887, + "num_input_tokens_seen": 11264230, + "step": 527, + "time_per_iteration": 3.7356321811676025 + }, + { + "auxiliary_loss_clip": 0.01216815, + "auxiliary_loss_mlp": 0.01101649, + "balance_loss_clip": 1.06436479, + "balance_loss_mlp": 1.01548409, + "epoch": 0.06348824625744003, + "flos": 19500895221120.0, + "grad_norm": 3.1294775790338805, + "language_loss": 0.75820225, + "learning_rate": 3.988287528218854e-06, + "loss": 0.78138685, + "num_input_tokens_seen": 11283015, + "step": 528, + "time_per_iteration": 3.640895128250122 + }, + { + "auxiliary_loss_clip": 0.01222128, + "auxiliary_loss_mlp": 0.01097435, + "balance_loss_clip": 1.0692451, + "balance_loss_mlp": 1.01184225, + "epoch": 0.06360848914807912, + "flos": 15481233976320.0, + "grad_norm": 2.115093588862589, + "language_loss": 0.90289843, + "learning_rate": 3.98820319736314e-06, + "loss": 0.92609406, + "num_input_tokens_seen": 11299630, + "step": 529, + "time_per_iteration": 2.699403762817383 + }, + { + "auxiliary_loss_clip": 0.01202903, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.06574798, + "balance_loss_mlp": 1.00916338, + "epoch": 0.0637287320387182, + "flos": 20593369422720.0, + "grad_norm": 1.8288463833221043, + "language_loss": 0.85190558, + "learning_rate": 3.988118564898582e-06, + "loss": 0.87488687, + "num_input_tokens_seen": 11319170, + "step": 530, + "time_per_iteration": 2.8470535278320312 + }, + { + "auxiliary_loss_clip": 0.01197126, + "auxiliary_loss_mlp": 0.00876331, + "balance_loss_clip": 1.0634228, + "balance_loss_mlp": 1.00057697, + "epoch": 0.0638489749293573, + "flos": 17412222245760.0, + "grad_norm": 2.4200436338467837, + "language_loss": 0.8934899, + "learning_rate": 3.988033630838019e-06, + "loss": 0.91422451, + "num_input_tokens_seen": 11333210, + "step": 531, + "time_per_iteration": 2.8529415130615234 + }, + { + "auxiliary_loss_clip": 0.01225181, + "auxiliary_loss_mlp": 0.01098487, + "balance_loss_clip": 1.06972742, + "balance_loss_mlp": 1.01227498, + "epoch": 0.0639692178199964, + "flos": 23807661874560.0, + "grad_norm": 1.714030950166966, + "language_loss": 0.87803304, + "learning_rate": 3.987948395194334e-06, + "loss": 0.90126967, + "num_input_tokens_seen": 11355590, + "step": 532, + "time_per_iteration": 2.8085238933563232 + }, + { + "auxiliary_loss_clip": 0.01217296, + "auxiliary_loss_mlp": 0.01095556, + "balance_loss_clip": 1.06299329, + "balance_loss_mlp": 1.00967777, + "epoch": 0.06408946071063548, + "flos": 18477225521280.0, + "grad_norm": 2.1905649241137954, + "language_loss": 0.76472259, + "learning_rate": 3.987862857980458e-06, + "loss": 0.78785115, + "num_input_tokens_seen": 11371535, + "step": 533, + "time_per_iteration": 2.7234623432159424 + }, + { + "auxiliary_loss_clip": 0.01196927, + "auxiliary_loss_mlp": 0.01096143, + "balance_loss_clip": 1.06239545, + "balance_loss_mlp": 1.01078904, + "epoch": 0.06420970360127458, + "flos": 27162220936320.0, + "grad_norm": 1.9261403521228035, + "language_loss": 0.77062118, + "learning_rate": 3.987777019209368e-06, + "loss": 0.79355186, + "num_input_tokens_seen": 11392050, + "step": 534, + "time_per_iteration": 2.9062283039093018 + }, + { + "auxiliary_loss_clip": 0.01231627, + "auxiliary_loss_mlp": 0.01097719, + "balance_loss_clip": 1.06846356, + "balance_loss_mlp": 1.01203179, + "epoch": 0.06432994649191366, + "flos": 23659673840640.0, + "grad_norm": 1.8305813684854098, + "language_loss": 0.8127175, + "learning_rate": 3.987690878894084e-06, + "loss": 0.83601093, + "num_input_tokens_seen": 11411765, + "step": 535, + "time_per_iteration": 2.7481207847595215 + }, + { + "auxiliary_loss_clip": 0.01211169, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_clip": 1.06469226, + "balance_loss_mlp": 1.01103354, + "epoch": 0.06445018938255276, + "flos": 23403953940480.0, + "grad_norm": 2.937998384394042, + "language_loss": 0.85226369, + "learning_rate": 3.987604437047673e-06, + "loss": 0.87534451, + "num_input_tokens_seen": 11431565, + "step": 536, + "time_per_iteration": 2.7482194900512695 + }, + { + "auxiliary_loss_clip": 0.01218844, + "auxiliary_loss_mlp": 0.01097834, + "balance_loss_clip": 1.06597328, + "balance_loss_mlp": 1.01186037, + "epoch": 0.06457043227319184, + "flos": 19646692525440.0, + "grad_norm": 2.070171242512328, + "language_loss": 0.77561057, + "learning_rate": 3.987517693683251e-06, + "loss": 0.79877734, + "num_input_tokens_seen": 11450140, + "step": 537, + "time_per_iteration": 2.703134059906006 + }, + { + "auxiliary_loss_clip": 0.01198294, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_clip": 1.06075287, + "balance_loss_mlp": 1.0141108, + "epoch": 0.06469067516383094, + "flos": 16978744915200.0, + "grad_norm": 2.748122869206779, + "language_loss": 0.95748526, + "learning_rate": 3.9874306488139745e-06, + "loss": 0.98046565, + "num_input_tokens_seen": 11465400, + "step": 538, + "time_per_iteration": 2.8475217819213867 + }, + { + "auxiliary_loss_clip": 0.01190877, + "auxiliary_loss_mlp": 0.01095014, + "balance_loss_clip": 1.05783129, + "balance_loss_mlp": 1.00966036, + "epoch": 0.06481091805447003, + "flos": 23296401642240.0, + "grad_norm": 3.152147254936746, + "language_loss": 0.87946981, + "learning_rate": 3.987343302453049e-06, + "loss": 0.90232873, + "num_input_tokens_seen": 11486675, + "step": 539, + "time_per_iteration": 2.873551368713379 + }, + { + "auxiliary_loss_clip": 0.01211274, + "auxiliary_loss_mlp": 0.01098352, + "balance_loss_clip": 1.06679463, + "balance_loss_mlp": 1.01271224, + "epoch": 0.06493116094510912, + "flos": 29172356824320.0, + "grad_norm": 1.6759164414522312, + "language_loss": 0.82707775, + "learning_rate": 3.987255654613724e-06, + "loss": 0.85017401, + "num_input_tokens_seen": 11510440, + "step": 540, + "time_per_iteration": 2.9118411540985107 + }, + { + "auxiliary_loss_clip": 0.01192324, + "auxiliary_loss_mlp": 0.01099576, + "balance_loss_clip": 1.05814373, + "balance_loss_mlp": 1.01364994, + "epoch": 0.06505140383574821, + "flos": 19865065259520.0, + "grad_norm": 2.20768535592758, + "language_loss": 0.70276457, + "learning_rate": 3.987167705309296e-06, + "loss": 0.72568357, + "num_input_tokens_seen": 11529715, + "step": 541, + "time_per_iteration": 2.8744752407073975 + }, + { + "auxiliary_loss_clip": 0.01221622, + "auxiliary_loss_mlp": 0.00876148, + "balance_loss_clip": 1.06687665, + "balance_loss_mlp": 1.00051367, + "epoch": 0.0651716467263873, + "flos": 17924703540480.0, + "grad_norm": 2.257425002863879, + "language_loss": 0.95315951, + "learning_rate": 3.987079454553108e-06, + "loss": 0.97413719, + "num_input_tokens_seen": 11547665, + "step": 542, + "time_per_iteration": 2.7443552017211914 + }, + { + "auxiliary_loss_clip": 0.01193297, + "auxiliary_loss_mlp": 0.01101654, + "balance_loss_clip": 1.06256604, + "balance_loss_mlp": 1.01577544, + "epoch": 0.0652918896170264, + "flos": 20842840356480.0, + "grad_norm": 1.882135538572027, + "language_loss": 0.91148651, + "learning_rate": 3.986990902358546e-06, + "loss": 0.93443608, + "num_input_tokens_seen": 11564605, + "step": 543, + "time_per_iteration": 2.86761736869812 + }, + { + "auxiliary_loss_clip": 0.01216936, + "auxiliary_loss_mlp": 0.01097823, + "balance_loss_clip": 1.06295693, + "balance_loss_mlp": 1.01175356, + "epoch": 0.06541213250766549, + "flos": 21872507627520.0, + "grad_norm": 3.0794208004075054, + "language_loss": 0.93519843, + "learning_rate": 3.986902048739045e-06, + "loss": 0.95834601, + "num_input_tokens_seen": 11584550, + "step": 544, + "time_per_iteration": 2.7373175621032715 + }, + { + "auxiliary_loss_clip": 0.01200519, + "auxiliary_loss_mlp": 0.0109597, + "balance_loss_clip": 1.05836177, + "balance_loss_mlp": 1.01047277, + "epoch": 0.06553237539830457, + "flos": 23110743219840.0, + "grad_norm": 2.674675499578808, + "language_loss": 0.80037463, + "learning_rate": 3.986812893708082e-06, + "loss": 0.82333952, + "num_input_tokens_seen": 11600740, + "step": 545, + "time_per_iteration": 2.798605442047119 + }, + { + "auxiliary_loss_clip": 0.01200503, + "auxiliary_loss_mlp": 0.01097272, + "balance_loss_clip": 1.06431198, + "balance_loss_mlp": 1.01144183, + "epoch": 0.06565261828894367, + "flos": 17923769786880.0, + "grad_norm": 2.0308881548767834, + "language_loss": 0.81155294, + "learning_rate": 3.9867234372791826e-06, + "loss": 0.83453071, + "num_input_tokens_seen": 11618695, + "step": 546, + "time_per_iteration": 2.8026528358459473 + }, + { + "auxiliary_loss_clip": 0.01217542, + "auxiliary_loss_mlp": 0.01096407, + "balance_loss_clip": 1.06396258, + "balance_loss_mlp": 1.01071954, + "epoch": 0.06577286117958275, + "flos": 22783058421120.0, + "grad_norm": 1.5661669297347816, + "language_loss": 0.8742367, + "learning_rate": 3.986633679465918e-06, + "loss": 0.89737618, + "num_input_tokens_seen": 11638850, + "step": 547, + "time_per_iteration": 2.8781256675720215 + }, + { + "auxiliary_loss_clip": 0.01175621, + "auxiliary_loss_mlp": 0.01099367, + "balance_loss_clip": 1.0535568, + "balance_loss_mlp": 1.0138706, + "epoch": 0.06589310407022185, + "flos": 23696194993920.0, + "grad_norm": 6.13965212562239, + "language_loss": 0.8023665, + "learning_rate": 3.986543620281904e-06, + "loss": 0.8251164, + "num_input_tokens_seen": 11658500, + "step": 548, + "time_per_iteration": 2.951533555984497 + }, + { + "auxiliary_loss_clip": 0.0119937, + "auxiliary_loss_mlp": 0.01097414, + "balance_loss_clip": 1.06438494, + "balance_loss_mlp": 1.01196456, + "epoch": 0.06601334696086093, + "flos": 26864772410880.0, + "grad_norm": 1.9037049249925349, + "language_loss": 0.91111892, + "learning_rate": 3.986453259740802e-06, + "loss": 0.93408674, + "num_input_tokens_seen": 11676670, + "step": 549, + "time_per_iteration": 2.853322982788086 + }, + { + "auxiliary_loss_clip": 0.01207635, + "auxiliary_loss_mlp": 0.01094623, + "balance_loss_clip": 1.06557298, + "balance_loss_mlp": 1.00922179, + "epoch": 0.06613358985150003, + "flos": 12567694101120.0, + "grad_norm": 2.640534290856007, + "language_loss": 0.78649998, + "learning_rate": 3.986362597856319e-06, + "loss": 0.80952257, + "num_input_tokens_seen": 11693170, + "step": 550, + "time_per_iteration": 2.8254644870758057 + }, + { + "auxiliary_loss_clip": 0.01201034, + "auxiliary_loss_mlp": 0.00876152, + "balance_loss_clip": 1.05748057, + "balance_loss_mlp": 1.00045609, + "epoch": 0.06625383274213913, + "flos": 18332505624960.0, + "grad_norm": 2.641809104681193, + "language_loss": 0.81748652, + "learning_rate": 3.986271634642211e-06, + "loss": 0.83825839, + "num_input_tokens_seen": 11710150, + "step": 551, + "time_per_iteration": 4.506532669067383 + }, + { + "auxiliary_loss_clip": 0.01230632, + "auxiliary_loss_mlp": 0.01099259, + "balance_loss_clip": 1.06903541, + "balance_loss_mlp": 1.01376188, + "epoch": 0.06637407563277821, + "flos": 15375585098880.0, + "grad_norm": 1.9301127587285882, + "language_loss": 0.81465149, + "learning_rate": 3.986180370112274e-06, + "loss": 0.83795047, + "num_input_tokens_seen": 11726670, + "step": 552, + "time_per_iteration": 3.693042278289795 + }, + { + "auxiliary_loss_clip": 0.01218686, + "auxiliary_loss_mlp": 0.00876141, + "balance_loss_clip": 1.06564403, + "balance_loss_mlp": 1.00045872, + "epoch": 0.0664943185234173, + "flos": 24025244509440.0, + "grad_norm": 1.6638274905601667, + "language_loss": 0.74404085, + "learning_rate": 3.986088804280354e-06, + "loss": 0.76498914, + "num_input_tokens_seen": 11746400, + "step": 553, + "time_per_iteration": 3.64679217338562 + }, + { + "auxiliary_loss_clip": 0.01207875, + "auxiliary_loss_mlp": 0.01096172, + "balance_loss_clip": 1.06460667, + "balance_loss_mlp": 1.01067543, + "epoch": 0.06661456141405639, + "flos": 20957503547520.0, + "grad_norm": 3.15326485298569, + "language_loss": 0.94251192, + "learning_rate": 3.985996937160342e-06, + "loss": 0.96555239, + "num_input_tokens_seen": 11765590, + "step": 554, + "time_per_iteration": 2.799109697341919 + }, + { + "auxiliary_loss_clip": 0.01218101, + "auxiliary_loss_mlp": 0.01098116, + "balance_loss_clip": 1.06613457, + "balance_loss_mlp": 1.01252413, + "epoch": 0.06673480430469549, + "flos": 52223953322880.0, + "grad_norm": 2.0238029993908118, + "language_loss": 0.6876353, + "learning_rate": 3.985904768766173e-06, + "loss": 0.71079743, + "num_input_tokens_seen": 11788365, + "step": 555, + "time_per_iteration": 3.0083162784576416 + }, + { + "auxiliary_loss_clip": 0.01198003, + "auxiliary_loss_mlp": 0.01099308, + "balance_loss_clip": 1.06300664, + "balance_loss_mlp": 1.01362014, + "epoch": 0.06685504719533458, + "flos": 16217079995520.0, + "grad_norm": 2.452533980330301, + "language_loss": 0.76079726, + "learning_rate": 3.98581229911183e-06, + "loss": 0.78377032, + "num_input_tokens_seen": 11807285, + "step": 556, + "time_per_iteration": 2.8918445110321045 + }, + { + "auxiliary_loss_clip": 0.01219948, + "auxiliary_loss_mlp": 0.01099404, + "balance_loss_clip": 1.06556296, + "balance_loss_mlp": 1.01362097, + "epoch": 0.06697529008597367, + "flos": 22491535639680.0, + "grad_norm": 1.9887337194031602, + "language_loss": 0.92101306, + "learning_rate": 3.985719528211341e-06, + "loss": 0.9442066, + "num_input_tokens_seen": 11826655, + "step": 557, + "time_per_iteration": 2.7631783485412598 + }, + { + "auxiliary_loss_clip": 0.01198047, + "auxiliary_loss_mlp": 0.01082031, + "balance_loss_clip": 1.06668901, + "balance_loss_mlp": 1.00077772, + "epoch": 0.06709553297661276, + "flos": 62688216936960.0, + "grad_norm": 0.8546672579755553, + "language_loss": 0.63111383, + "learning_rate": 3.985626456078777e-06, + "loss": 0.65391463, + "num_input_tokens_seen": 11891310, + "step": 558, + "time_per_iteration": 3.3631792068481445 + }, + { + "auxiliary_loss_clip": 0.01193373, + "auxiliary_loss_mlp": 0.01098911, + "balance_loss_clip": 1.06068766, + "balance_loss_mlp": 1.01384366, + "epoch": 0.06721577586725185, + "flos": 11216590997760.0, + "grad_norm": 2.3270904215452024, + "language_loss": 0.85940742, + "learning_rate": 3.985533082728259e-06, + "loss": 0.8823303, + "num_input_tokens_seen": 11906965, + "step": 559, + "time_per_iteration": 2.8254308700561523 + }, + { + "auxiliary_loss_clip": 0.0122763, + "auxiliary_loss_mlp": 0.01100351, + "balance_loss_clip": 1.06582272, + "balance_loss_mlp": 1.01461542, + "epoch": 0.06733601875789094, + "flos": 25922189664000.0, + "grad_norm": 1.7129460542848771, + "language_loss": 0.74955559, + "learning_rate": 3.985439408173951e-06, + "loss": 0.77283537, + "num_input_tokens_seen": 11927190, + "step": 560, + "time_per_iteration": 2.699427366256714 + }, + { + "auxiliary_loss_clip": 0.01228106, + "auxiliary_loss_mlp": 0.01097686, + "balance_loss_clip": 1.06610024, + "balance_loss_mlp": 1.01237941, + "epoch": 0.06745626164853002, + "flos": 20813645577600.0, + "grad_norm": 2.8813545529480344, + "language_loss": 0.70496166, + "learning_rate": 3.9853454324300634e-06, + "loss": 0.72821951, + "num_input_tokens_seen": 11946400, + "step": 561, + "time_per_iteration": 2.7723915576934814 + }, + { + "auxiliary_loss_clip": 0.01166534, + "auxiliary_loss_mlp": 0.01097334, + "balance_loss_clip": 1.05363524, + "balance_loss_mlp": 1.01174188, + "epoch": 0.06757650453916912, + "flos": 19829262378240.0, + "grad_norm": 4.12320833801696, + "language_loss": 0.77855086, + "learning_rate": 3.985251155510852e-06, + "loss": 0.80118954, + "num_input_tokens_seen": 11965430, + "step": 562, + "time_per_iteration": 2.9182467460632324 + }, + { + "auxiliary_loss_clip": 0.01167234, + "auxiliary_loss_mlp": 0.01096998, + "balance_loss_clip": 1.04715693, + "balance_loss_mlp": 1.01169181, + "epoch": 0.06769674742980822, + "flos": 25739224761600.0, + "grad_norm": 1.8237016491229974, + "language_loss": 0.80163062, + "learning_rate": 3.98515657743062e-06, + "loss": 0.82427299, + "num_input_tokens_seen": 11984895, + "step": 563, + "time_per_iteration": 2.901230812072754 + }, + { + "auxiliary_loss_clip": 0.01199016, + "auxiliary_loss_mlp": 0.01095608, + "balance_loss_clip": 1.05965829, + "balance_loss_mlp": 1.01020694, + "epoch": 0.0678169903204473, + "flos": 13074788355840.0, + "grad_norm": 1.9165547743093092, + "language_loss": 0.77504814, + "learning_rate": 3.985061698203711e-06, + "loss": 0.79799438, + "num_input_tokens_seen": 12002010, + "step": 564, + "time_per_iteration": 2.717193603515625 + }, + { + "auxiliary_loss_clip": 0.01219455, + "auxiliary_loss_mlp": 0.01081985, + "balance_loss_clip": 1.07044816, + "balance_loss_mlp": 1.0007323, + "epoch": 0.0679372332110864, + "flos": 70865830788480.0, + "grad_norm": 0.8893156729932049, + "language_loss": 0.6388315, + "learning_rate": 3.984966517844523e-06, + "loss": 0.66184592, + "num_input_tokens_seen": 12057255, + "step": 565, + "time_per_iteration": 3.2170889377593994 + }, + { + "auxiliary_loss_clip": 0.01227953, + "auxiliary_loss_mlp": 0.01097126, + "balance_loss_clip": 1.06667018, + "balance_loss_mlp": 1.01153362, + "epoch": 0.06805747610172548, + "flos": 28256418990720.0, + "grad_norm": 2.1126307671960225, + "language_loss": 0.80624664, + "learning_rate": 3.984871036367492e-06, + "loss": 0.82949746, + "num_input_tokens_seen": 12077280, + "step": 566, + "time_per_iteration": 2.771469831466675 + }, + { + "auxiliary_loss_clip": 0.01215077, + "auxiliary_loss_mlp": 0.00875915, + "balance_loss_clip": 1.06343174, + "balance_loss_mlp": 1.00034773, + "epoch": 0.06817771899236458, + "flos": 20120533764480.0, + "grad_norm": 2.5997314338329893, + "language_loss": 0.82820976, + "learning_rate": 3.984775253787102e-06, + "loss": 0.84911966, + "num_input_tokens_seen": 12095570, + "step": 567, + "time_per_iteration": 2.8078463077545166 + }, + { + "auxiliary_loss_clip": 0.01217422, + "auxiliary_loss_mlp": 0.01097951, + "balance_loss_clip": 1.06403446, + "balance_loss_mlp": 1.01216853, + "epoch": 0.06829796188300366, + "flos": 17930629284480.0, + "grad_norm": 2.630887120270693, + "language_loss": 0.87932009, + "learning_rate": 3.984679170117885e-06, + "loss": 0.90247381, + "num_input_tokens_seen": 12111775, + "step": 568, + "time_per_iteration": 2.7498648166656494 + }, + { + "auxiliary_loss_clip": 0.012095, + "auxiliary_loss_mlp": 0.0109497, + "balance_loss_clip": 1.06147826, + "balance_loss_mlp": 1.00980723, + "epoch": 0.06841820477364276, + "flos": 14501627285760.0, + "grad_norm": 2.7484114535304913, + "language_loss": 0.78585804, + "learning_rate": 3.984582785374415e-06, + "loss": 0.80890274, + "num_input_tokens_seen": 12129215, + "step": 569, + "time_per_iteration": 2.68351674079895 + }, + { + "auxiliary_loss_clip": 0.01200998, + "auxiliary_loss_mlp": 0.00875972, + "balance_loss_clip": 1.06020081, + "balance_loss_mlp": 1.00040698, + "epoch": 0.06853844766428185, + "flos": 21938474954880.0, + "grad_norm": 2.2743921726720857, + "language_loss": 0.80442572, + "learning_rate": 3.9844860995713155e-06, + "loss": 0.82519537, + "num_input_tokens_seen": 12148755, + "step": 570, + "time_per_iteration": 2.913588762283325 + }, + { + "auxiliary_loss_clip": 0.01219603, + "auxiliary_loss_mlp": 0.01097548, + "balance_loss_clip": 1.06821656, + "balance_loss_mlp": 1.01271904, + "epoch": 0.06865869055492094, + "flos": 16800628348800.0, + "grad_norm": 2.288795630199342, + "language_loss": 0.83305919, + "learning_rate": 3.9843891127232524e-06, + "loss": 0.85623074, + "num_input_tokens_seen": 12166290, + "step": 571, + "time_per_iteration": 2.963447093963623 + }, + { + "auxiliary_loss_clip": 0.01180699, + "auxiliary_loss_mlp": 0.01097355, + "balance_loss_clip": 1.05711627, + "balance_loss_mlp": 1.01238215, + "epoch": 0.06877893344556003, + "flos": 19937281553280.0, + "grad_norm": 2.421604858154358, + "language_loss": 0.67120433, + "learning_rate": 3.984291824844938e-06, + "loss": 0.69398487, + "num_input_tokens_seen": 12181385, + "step": 572, + "time_per_iteration": 3.0285768508911133 + }, + { + "auxiliary_loss_clip": 0.01227015, + "auxiliary_loss_mlp": 0.01093846, + "balance_loss_clip": 1.06655729, + "balance_loss_mlp": 1.00844431, + "epoch": 0.06889917633619912, + "flos": 23039388852480.0, + "grad_norm": 5.72476763652363, + "language_loss": 0.85251474, + "learning_rate": 3.984194235951132e-06, + "loss": 0.87572336, + "num_input_tokens_seen": 12197530, + "step": 573, + "time_per_iteration": 2.877065420150757 + }, + { + "auxiliary_loss_clip": 0.01227672, + "auxiliary_loss_mlp": 0.01096566, + "balance_loss_clip": 1.06649995, + "balance_loss_mlp": 1.01111698, + "epoch": 0.06901941922683821, + "flos": 20960556203520.0, + "grad_norm": 3.591609392115281, + "language_loss": 0.85003281, + "learning_rate": 3.9840963460566375e-06, + "loss": 0.87327522, + "num_input_tokens_seen": 12216310, + "step": 574, + "time_per_iteration": 2.81193208694458 + }, + { + "auxiliary_loss_clip": 0.01167062, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_clip": 1.05352902, + "balance_loss_mlp": 1.01345253, + "epoch": 0.06913966211747731, + "flos": 24821850384000.0, + "grad_norm": 1.6069620462862246, + "language_loss": 0.89296305, + "learning_rate": 3.983998155176305e-06, + "loss": 0.91561741, + "num_input_tokens_seen": 12236670, + "step": 575, + "time_per_iteration": 3.0397157669067383 + }, + { + "auxiliary_loss_clip": 0.01214672, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_clip": 1.06635129, + "balance_loss_mlp": 1.00025904, + "epoch": 0.06925990500811639, + "flos": 58367446957440.0, + "grad_norm": 0.8150534891012687, + "language_loss": 0.57062161, + "learning_rate": 3.9838996633250305e-06, + "loss": 0.59358346, + "num_input_tokens_seen": 12297185, + "step": 576, + "time_per_iteration": 4.076212644577026 + }, + { + "auxiliary_loss_clip": 0.01214803, + "auxiliary_loss_mlp": 0.01099386, + "balance_loss_clip": 1.06193376, + "balance_loss_mlp": 1.01427054, + "epoch": 0.06938014789875549, + "flos": 12749940731520.0, + "grad_norm": 2.2971269770693614, + "language_loss": 0.88287497, + "learning_rate": 3.983800870517753e-06, + "loss": 0.90601683, + "num_input_tokens_seen": 12313975, + "step": 577, + "time_per_iteration": 4.64219856262207 + }, + { + "auxiliary_loss_clip": 0.01210547, + "auxiliary_loss_mlp": 0.01095793, + "balance_loss_clip": 1.06376648, + "balance_loss_mlp": 1.0106771, + "epoch": 0.06950039078939457, + "flos": 22820226019200.0, + "grad_norm": 4.447384741255904, + "language_loss": 0.7822417, + "learning_rate": 3.983701776769463e-06, + "loss": 0.80530512, + "num_input_tokens_seen": 12331385, + "step": 578, + "time_per_iteration": 3.673985242843628 + }, + { + "auxiliary_loss_clip": 0.01214947, + "auxiliary_loss_mlp": 0.01096695, + "balance_loss_clip": 1.06260896, + "balance_loss_mlp": 1.01191354, + "epoch": 0.06962063368003367, + "flos": 21941348042880.0, + "grad_norm": 2.126677345824887, + "language_loss": 0.85733449, + "learning_rate": 3.9836023820951885e-06, + "loss": 0.88045096, + "num_input_tokens_seen": 12350600, + "step": 579, + "time_per_iteration": 2.7535412311553955 + }, + { + "auxiliary_loss_clip": 0.01175627, + "auxiliary_loss_mlp": 0.01096244, + "balance_loss_clip": 1.05024195, + "balance_loss_mlp": 1.0113194, + "epoch": 0.06974087657067275, + "flos": 20706021452160.0, + "grad_norm": 1.9408033151300845, + "language_loss": 0.68235022, + "learning_rate": 3.983502686510011e-06, + "loss": 0.70506895, + "num_input_tokens_seen": 12371430, + "step": 580, + "time_per_iteration": 2.7946958541870117 + }, + { + "auxiliary_loss_clip": 0.01213317, + "auxiliary_loss_mlp": 0.00875933, + "balance_loss_clip": 1.06109738, + "balance_loss_mlp": 1.0004729, + "epoch": 0.06986111946131185, + "flos": 22638230784000.0, + "grad_norm": 2.057826567076249, + "language_loss": 0.73547196, + "learning_rate": 3.9834026900290525e-06, + "loss": 0.75636446, + "num_input_tokens_seen": 12390825, + "step": 581, + "time_per_iteration": 2.765484094619751 + }, + { + "auxiliary_loss_clip": 0.01222706, + "auxiliary_loss_mlp": 0.01098468, + "balance_loss_clip": 1.06193948, + "balance_loss_mlp": 1.01320922, + "epoch": 0.06998136235195095, + "flos": 26943453152640.0, + "grad_norm": 1.9588182232476512, + "language_loss": 1.00148058, + "learning_rate": 3.983302392667482e-06, + "loss": 1.0246923, + "num_input_tokens_seen": 12411670, + "step": 582, + "time_per_iteration": 2.8242881298065186 + }, + { + "auxiliary_loss_clip": 0.01206421, + "auxiliary_loss_mlp": 0.0109388, + "balance_loss_clip": 1.05790782, + "balance_loss_mlp": 1.00905073, + "epoch": 0.07010160524259003, + "flos": 22492505306880.0, + "grad_norm": 1.7666737536437298, + "language_loss": 0.93515217, + "learning_rate": 3.983201794440517e-06, + "loss": 0.95815516, + "num_input_tokens_seen": 12431245, + "step": 583, + "time_per_iteration": 2.752251148223877 + }, + { + "auxiliary_loss_clip": 0.0120152, + "auxiliary_loss_mlp": 0.01098552, + "balance_loss_clip": 1.05972397, + "balance_loss_mlp": 1.01329386, + "epoch": 0.07022184813322913, + "flos": 18332541538560.0, + "grad_norm": 1.9914830891632207, + "language_loss": 0.67689538, + "learning_rate": 3.9831008953634165e-06, + "loss": 0.6998961, + "num_input_tokens_seen": 12450535, + "step": 584, + "time_per_iteration": 2.764240026473999 + }, + { + "auxiliary_loss_clip": 0.01168031, + "auxiliary_loss_mlp": 0.01095986, + "balance_loss_clip": 1.0568167, + "balance_loss_mlp": 1.01101351, + "epoch": 0.07034209102386821, + "flos": 24675550289280.0, + "grad_norm": 2.5144488805368845, + "language_loss": 0.81203318, + "learning_rate": 3.9829996954514864e-06, + "loss": 0.8346734, + "num_input_tokens_seen": 12469675, + "step": 585, + "time_per_iteration": 2.8826005458831787 + }, + { + "auxiliary_loss_clip": 0.01212119, + "auxiliary_loss_mlp": 0.01096918, + "balance_loss_clip": 1.06022954, + "balance_loss_mlp": 1.01194572, + "epoch": 0.0704623339145073, + "flos": 25995878415360.0, + "grad_norm": 2.032710187435613, + "language_loss": 0.83990765, + "learning_rate": 3.982898194720079e-06, + "loss": 0.86299795, + "num_input_tokens_seen": 12490405, + "step": 586, + "time_per_iteration": 2.7679386138916016 + }, + { + "auxiliary_loss_clip": 0.01204037, + "auxiliary_loss_mlp": 0.00876028, + "balance_loss_clip": 1.06129968, + "balance_loss_mlp": 1.00058782, + "epoch": 0.0705825768051464, + "flos": 25338318088320.0, + "grad_norm": 2.1274855710146983, + "language_loss": 0.82284117, + "learning_rate": 3.982796393184592e-06, + "loss": 0.84364188, + "num_input_tokens_seen": 12509485, + "step": 587, + "time_per_iteration": 2.827526569366455 + }, + { + "auxiliary_loss_clip": 0.01201868, + "auxiliary_loss_mlp": 0.01082492, + "balance_loss_clip": 1.06255627, + "balance_loss_mlp": 1.00123918, + "epoch": 0.07070281969578548, + "flos": 66047552507520.0, + "grad_norm": 0.7988000931651562, + "language_loss": 0.62671459, + "learning_rate": 3.98269429086047e-06, + "loss": 0.64955819, + "num_input_tokens_seen": 12567325, + "step": 588, + "time_per_iteration": 3.169646978378296 + }, + { + "auxiliary_loss_clip": 0.01194816, + "auxiliary_loss_mlp": 0.01094988, + "balance_loss_clip": 1.06105638, + "balance_loss_mlp": 1.00949132, + "epoch": 0.07082306258642458, + "flos": 23653568528640.0, + "grad_norm": 2.709336088095099, + "language_loss": 0.86196434, + "learning_rate": 3.982591887763199e-06, + "loss": 0.88486242, + "num_input_tokens_seen": 12584785, + "step": 589, + "time_per_iteration": 2.9629406929016113 + }, + { + "auxiliary_loss_clip": 0.01185284, + "auxiliary_loss_mlp": 0.01099646, + "balance_loss_clip": 1.05805278, + "balance_loss_mlp": 1.01433945, + "epoch": 0.07094330547706366, + "flos": 13880049408000.0, + "grad_norm": 2.2773042652623783, + "language_loss": 0.81795931, + "learning_rate": 3.982489183908316e-06, + "loss": 0.84080863, + "num_input_tokens_seen": 12601205, + "step": 590, + "time_per_iteration": 2.8030450344085693 + }, + { + "auxiliary_loss_clip": 0.01150963, + "auxiliary_loss_mlp": 0.01095839, + "balance_loss_clip": 1.05142117, + "balance_loss_mlp": 1.01115298, + "epoch": 0.07106354836770276, + "flos": 24645098534400.0, + "grad_norm": 1.7606345107908183, + "language_loss": 0.84614885, + "learning_rate": 3.982386179311399e-06, + "loss": 0.86861688, + "num_input_tokens_seen": 12621725, + "step": 591, + "time_per_iteration": 2.9675307273864746 + }, + { + "auxiliary_loss_clip": 0.01209218, + "auxiliary_loss_mlp": 0.01099536, + "balance_loss_clip": 1.05912113, + "balance_loss_mlp": 1.01375294, + "epoch": 0.07118379125834184, + "flos": 16217223649920.0, + "grad_norm": 2.841694904206885, + "language_loss": 0.87960494, + "learning_rate": 3.982282873988075e-06, + "loss": 0.9026925, + "num_input_tokens_seen": 12639600, + "step": 592, + "time_per_iteration": 2.8574347496032715 + }, + { + "auxiliary_loss_clip": 0.01200683, + "auxiliary_loss_mlp": 0.01096836, + "balance_loss_clip": 1.05970764, + "balance_loss_mlp": 1.01234007, + "epoch": 0.07130403414898094, + "flos": 19719986227200.0, + "grad_norm": 1.8630262703172362, + "language_loss": 0.87245929, + "learning_rate": 3.982179267954016e-06, + "loss": 0.89543444, + "num_input_tokens_seen": 12660030, + "step": 593, + "time_per_iteration": 2.8781235218048096 + }, + { + "auxiliary_loss_clip": 0.01219448, + "auxiliary_loss_mlp": 0.01095835, + "balance_loss_clip": 1.0601182, + "balance_loss_mlp": 1.01062393, + "epoch": 0.07142427703962004, + "flos": 21871933009920.0, + "grad_norm": 2.2698764292551847, + "language_loss": 0.95868278, + "learning_rate": 3.982075361224937e-06, + "loss": 0.9818356, + "num_input_tokens_seen": 12678395, + "step": 594, + "time_per_iteration": 2.819833993911743 + }, + { + "auxiliary_loss_clip": 0.01203173, + "auxiliary_loss_mlp": 0.00875888, + "balance_loss_clip": 1.0551697, + "balance_loss_mlp": 1.00047195, + "epoch": 0.07154451993025912, + "flos": 18296595002880.0, + "grad_norm": 1.8953211071685312, + "language_loss": 0.87971759, + "learning_rate": 3.981971153816602e-06, + "loss": 0.90050817, + "num_input_tokens_seen": 12696000, + "step": 595, + "time_per_iteration": 2.8625969886779785 + }, + { + "auxiliary_loss_clip": 0.01224959, + "auxiliary_loss_mlp": 0.01098049, + "balance_loss_clip": 1.06593299, + "balance_loss_mlp": 1.01298141, + "epoch": 0.07166476282089822, + "flos": 22160690444160.0, + "grad_norm": 1.6084708648701955, + "language_loss": 0.9615556, + "learning_rate": 3.981866645744819e-06, + "loss": 0.98478574, + "num_input_tokens_seen": 12716715, + "step": 596, + "time_per_iteration": 2.7670645713806152 + }, + { + "auxiliary_loss_clip": 0.01221887, + "auxiliary_loss_mlp": 0.00875983, + "balance_loss_clip": 1.06217086, + "balance_loss_mlp": 1.00038922, + "epoch": 0.0717850057115373, + "flos": 14136343925760.0, + "grad_norm": 2.4961125053561437, + "language_loss": 0.81621403, + "learning_rate": 3.9817618370254416e-06, + "loss": 0.83719277, + "num_input_tokens_seen": 12733370, + "step": 597, + "time_per_iteration": 2.765669345855713 + }, + { + "auxiliary_loss_clip": 0.0122125, + "auxiliary_loss_mlp": 0.01098634, + "balance_loss_clip": 1.0608933, + "balance_loss_mlp": 1.01313734, + "epoch": 0.0719052486021764, + "flos": 30917794412160.0, + "grad_norm": 2.2824773574589514, + "language_loss": 0.87209392, + "learning_rate": 3.9816567276743684e-06, + "loss": 0.89529276, + "num_input_tokens_seen": 12753235, + "step": 598, + "time_per_iteration": 2.842088222503662 + }, + { + "auxiliary_loss_clip": 0.0119567, + "auxiliary_loss_mlp": 0.01094181, + "balance_loss_clip": 1.05712795, + "balance_loss_mlp": 1.00916064, + "epoch": 0.0720254914928155, + "flos": 21287019939840.0, + "grad_norm": 1.8533489790502808, + "language_loss": 0.77582771, + "learning_rate": 3.9815513177075466e-06, + "loss": 0.7987262, + "num_input_tokens_seen": 12772020, + "step": 599, + "time_per_iteration": 2.8086342811584473 + }, + { + "auxiliary_loss_clip": 0.01213102, + "auxiliary_loss_mlp": 0.01099998, + "balance_loss_clip": 1.06189489, + "balance_loss_mlp": 1.01478767, + "epoch": 0.07214573438345458, + "flos": 27819170732160.0, + "grad_norm": 1.5564806262704087, + "language_loss": 0.70270413, + "learning_rate": 3.9814456071409646e-06, + "loss": 0.72583514, + "num_input_tokens_seen": 12792555, + "step": 600, + "time_per_iteration": 2.8331305980682373 + }, + { + "auxiliary_loss_clip": 0.01170542, + "auxiliary_loss_mlp": 0.01094543, + "balance_loss_clip": 1.05082011, + "balance_loss_mlp": 1.00933218, + "epoch": 0.07226597727409367, + "flos": 25483576688640.0, + "grad_norm": 2.7983638450766497, + "language_loss": 0.85262412, + "learning_rate": 3.981339595990659e-06, + "loss": 0.87527502, + "num_input_tokens_seen": 12811085, + "step": 601, + "time_per_iteration": 2.8363418579101562 + }, + { + "auxiliary_loss_clip": 0.01206117, + "auxiliary_loss_mlp": 0.01096046, + "balance_loss_clip": 1.05960429, + "balance_loss_mlp": 1.0104537, + "epoch": 0.07238622016473276, + "flos": 23513840622720.0, + "grad_norm": 1.9446277450841751, + "language_loss": 0.81069827, + "learning_rate": 3.981233284272713e-06, + "loss": 0.83371991, + "num_input_tokens_seen": 12830830, + "step": 602, + "time_per_iteration": 4.64277720451355 + }, + { + "auxiliary_loss_clip": 0.01186093, + "auxiliary_loss_mlp": 0.01098946, + "balance_loss_clip": 1.05452383, + "balance_loss_mlp": 1.01421177, + "epoch": 0.07250646305537185, + "flos": 25453519983360.0, + "grad_norm": 1.5920828679021262, + "language_loss": 0.90008259, + "learning_rate": 3.981126672003253e-06, + "loss": 0.92293298, + "num_input_tokens_seen": 12853505, + "step": 603, + "time_per_iteration": 4.031603574752808 + }, + { + "auxiliary_loss_clip": 0.01201701, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_clip": 1.05962896, + "balance_loss_mlp": 1.01130271, + "epoch": 0.07262670594601094, + "flos": 27155038216320.0, + "grad_norm": 2.0422741546257708, + "language_loss": 0.77905631, + "learning_rate": 3.981019759198451e-06, + "loss": 0.80203652, + "num_input_tokens_seen": 12872455, + "step": 604, + "time_per_iteration": 3.8823115825653076 + }, + { + "auxiliary_loss_clip": 0.01191196, + "auxiliary_loss_mlp": 0.01097321, + "balance_loss_clip": 1.05064666, + "balance_loss_mlp": 1.01230121, + "epoch": 0.07274694883665003, + "flos": 26651607148800.0, + "grad_norm": 2.001203856774291, + "language_loss": 0.84427977, + "learning_rate": 3.980912545874528e-06, + "loss": 0.86716503, + "num_input_tokens_seen": 12892620, + "step": 605, + "time_per_iteration": 2.7994399070739746 + }, + { + "auxiliary_loss_clip": 0.0121182, + "auxiliary_loss_mlp": 0.00875906, + "balance_loss_clip": 1.05976224, + "balance_loss_mlp": 1.00036454, + "epoch": 0.07286719172728913, + "flos": 29862344154240.0, + "grad_norm": 2.076165366050726, + "language_loss": 0.85562223, + "learning_rate": 3.980805032047746e-06, + "loss": 0.87649953, + "num_input_tokens_seen": 12914090, + "step": 606, + "time_per_iteration": 2.8289051055908203 + }, + { + "auxiliary_loss_clip": 0.01188915, + "auxiliary_loss_mlp": 0.01096675, + "balance_loss_clip": 1.05372202, + "balance_loss_mlp": 1.01141679, + "epoch": 0.07298743461792821, + "flos": 17382057799680.0, + "grad_norm": 1.8903188331690437, + "language_loss": 0.81288147, + "learning_rate": 3.980697217734415e-06, + "loss": 0.83573735, + "num_input_tokens_seen": 12931830, + "step": 607, + "time_per_iteration": 2.7851052284240723 + }, + { + "auxiliary_loss_clip": 0.01171431, + "auxiliary_loss_mlp": 0.00875828, + "balance_loss_clip": 1.04983306, + "balance_loss_mlp": 1.00031435, + "epoch": 0.07310767750856731, + "flos": 19498201701120.0, + "grad_norm": 1.8224415435750698, + "language_loss": 0.9143548, + "learning_rate": 3.980589102950891e-06, + "loss": 0.93482739, + "num_input_tokens_seen": 12949995, + "step": 608, + "time_per_iteration": 2.8843343257904053 + }, + { + "auxiliary_loss_clip": 0.01194739, + "auxiliary_loss_mlp": 0.01099579, + "balance_loss_clip": 1.05868649, + "balance_loss_mlp": 1.01417768, + "epoch": 0.07322792039920639, + "flos": 29168693637120.0, + "grad_norm": 2.8992297126192392, + "language_loss": 0.76137304, + "learning_rate": 3.9804806877135755e-06, + "loss": 0.78431618, + "num_input_tokens_seen": 12968040, + "step": 609, + "time_per_iteration": 2.877615213394165 + }, + { + "auxiliary_loss_clip": 0.01210212, + "auxiliary_loss_mlp": 0.00875997, + "balance_loss_clip": 1.05890012, + "balance_loss_mlp": 1.00035715, + "epoch": 0.07334816328984549, + "flos": 23477822259840.0, + "grad_norm": 2.0007352948256245, + "language_loss": 0.86255872, + "learning_rate": 3.980371972038915e-06, + "loss": 0.88342083, + "num_input_tokens_seen": 12988530, + "step": 610, + "time_per_iteration": 2.7494144439697266 + }, + { + "auxiliary_loss_clip": 0.01222968, + "auxiliary_loss_mlp": 0.01097, + "balance_loss_clip": 1.06378961, + "balance_loss_mlp": 1.0117898, + "epoch": 0.07346840618048459, + "flos": 22962467877120.0, + "grad_norm": 18.910507131170885, + "language_loss": 0.84246182, + "learning_rate": 3.980262955943399e-06, + "loss": 0.8656615, + "num_input_tokens_seen": 13008195, + "step": 611, + "time_per_iteration": 2.759277582168579 + }, + { + "auxiliary_loss_clip": 0.01190443, + "auxiliary_loss_mlp": 0.01097916, + "balance_loss_clip": 1.05424047, + "balance_loss_mlp": 1.01308644, + "epoch": 0.07358864907112367, + "flos": 17673903803520.0, + "grad_norm": 2.5780740320169913, + "language_loss": 0.86353296, + "learning_rate": 3.980153639443569e-06, + "loss": 0.88641655, + "num_input_tokens_seen": 13024180, + "step": 612, + "time_per_iteration": 2.7262637615203857 + }, + { + "auxiliary_loss_clip": 0.01195677, + "auxiliary_loss_mlp": 0.01098439, + "balance_loss_clip": 1.05406952, + "balance_loss_mlp": 1.01341927, + "epoch": 0.07370889196176277, + "flos": 24097029840000.0, + "grad_norm": 1.9927096603956862, + "language_loss": 0.80266738, + "learning_rate": 3.980044022556005e-06, + "loss": 0.82560849, + "num_input_tokens_seen": 13043865, + "step": 613, + "time_per_iteration": 2.8334925174713135 + }, + { + "auxiliary_loss_clip": 0.01203421, + "auxiliary_loss_mlp": 0.01100113, + "balance_loss_clip": 1.05515313, + "balance_loss_mlp": 1.01437759, + "epoch": 0.07382913485240185, + "flos": 25885919905920.0, + "grad_norm": 2.240952725610536, + "language_loss": 0.72917837, + "learning_rate": 3.9799341052973375e-06, + "loss": 0.75221372, + "num_input_tokens_seen": 13063700, + "step": 614, + "time_per_iteration": 2.7475526332855225 + }, + { + "auxiliary_loss_clip": 0.01194111, + "auxiliary_loss_mlp": 0.01098335, + "balance_loss_clip": 1.05647063, + "balance_loss_mlp": 1.01274228, + "epoch": 0.07394937774304094, + "flos": 16873850223360.0, + "grad_norm": 2.4665609830540367, + "language_loss": 0.7506187, + "learning_rate": 3.979823887684241e-06, + "loss": 0.77354312, + "num_input_tokens_seen": 13082640, + "step": 615, + "time_per_iteration": 2.8640732765197754 + }, + { + "auxiliary_loss_clip": 0.01221815, + "auxiliary_loss_mlp": 0.01098533, + "balance_loss_clip": 1.06249595, + "balance_loss_mlp": 1.01327419, + "epoch": 0.07406962063368003, + "flos": 20703471586560.0, + "grad_norm": 2.312912434648907, + "language_loss": 0.84535122, + "learning_rate": 3.979713369733434e-06, + "loss": 0.86855471, + "num_input_tokens_seen": 13100505, + "step": 616, + "time_per_iteration": 2.712679147720337 + }, + { + "auxiliary_loss_clip": 0.01211411, + "auxiliary_loss_mlp": 0.01098265, + "balance_loss_clip": 1.0598731, + "balance_loss_mlp": 1.01281595, + "epoch": 0.07418986352431912, + "flos": 21430985650560.0, + "grad_norm": 2.557033577831677, + "language_loss": 0.84551668, + "learning_rate": 3.979602551461683e-06, + "loss": 0.86861348, + "num_input_tokens_seen": 13121285, + "step": 617, + "time_per_iteration": 2.81797194480896 + }, + { + "auxiliary_loss_clip": 0.01195699, + "auxiliary_loss_mlp": 0.01095754, + "balance_loss_clip": 1.05679989, + "balance_loss_mlp": 1.01044798, + "epoch": 0.07431010641495822, + "flos": 12021133777920.0, + "grad_norm": 3.69402209109464, + "language_loss": 0.91820729, + "learning_rate": 3.979491432885799e-06, + "loss": 0.94112182, + "num_input_tokens_seen": 13137550, + "step": 618, + "time_per_iteration": 2.8583240509033203 + }, + { + "auxiliary_loss_clip": 0.01178566, + "auxiliary_loss_mlp": 0.00875803, + "balance_loss_clip": 1.05497742, + "balance_loss_mlp": 1.00030065, + "epoch": 0.0744303493055973, + "flos": 20957575374720.0, + "grad_norm": 2.5864230922100315, + "language_loss": 0.83133972, + "learning_rate": 3.97938001402264e-06, + "loss": 0.85188347, + "num_input_tokens_seen": 13156675, + "step": 619, + "time_per_iteration": 2.7455790042877197 + }, + { + "auxiliary_loss_clip": 0.01171693, + "auxiliary_loss_mlp": 0.01101109, + "balance_loss_clip": 1.05154181, + "balance_loss_mlp": 1.01565957, + "epoch": 0.0745505921962364, + "flos": 16253134272000.0, + "grad_norm": 3.636237237249551, + "language_loss": 0.79283094, + "learning_rate": 3.979268294889105e-06, + "loss": 0.81555897, + "num_input_tokens_seen": 13172225, + "step": 620, + "time_per_iteration": 2.778491973876953 + }, + { + "auxiliary_loss_clip": 0.01221396, + "auxiliary_loss_mlp": 0.01096026, + "balance_loss_clip": 1.06263065, + "balance_loss_mlp": 1.01072013, + "epoch": 0.07467083508687548, + "flos": 50944635550080.0, + "grad_norm": 2.4870434889359525, + "language_loss": 0.74031055, + "learning_rate": 3.979156275502143e-06, + "loss": 0.76348478, + "num_input_tokens_seen": 13195885, + "step": 621, + "time_per_iteration": 2.9205217361450195 + }, + { + "auxiliary_loss_clip": 0.01186018, + "auxiliary_loss_mlp": 0.01096907, + "balance_loss_clip": 1.0558784, + "balance_loss_mlp": 1.01164818, + "epoch": 0.07479107797751458, + "flos": 17529686697600.0, + "grad_norm": 2.2859427697329333, + "language_loss": 0.91522926, + "learning_rate": 3.979043955878749e-06, + "loss": 0.9380585, + "num_input_tokens_seen": 13213730, + "step": 622, + "time_per_iteration": 2.818408966064453 + }, + { + "auxiliary_loss_clip": 0.01191847, + "auxiliary_loss_mlp": 0.01099938, + "balance_loss_clip": 1.05619335, + "balance_loss_mlp": 1.01429844, + "epoch": 0.07491132086815366, + "flos": 23473943591040.0, + "grad_norm": 2.167397827097311, + "language_loss": 0.83202994, + "learning_rate": 3.978931336035959e-06, + "loss": 0.85494781, + "num_input_tokens_seen": 13232540, + "step": 623, + "time_per_iteration": 2.7959139347076416 + }, + { + "auxiliary_loss_clip": 0.01208811, + "auxiliary_loss_mlp": 0.01097591, + "balance_loss_clip": 1.06038725, + "balance_loss_mlp": 1.01185608, + "epoch": 0.07503156375879276, + "flos": 20157557708160.0, + "grad_norm": 2.161076171245895, + "language_loss": 0.82211316, + "learning_rate": 3.9788184159908595e-06, + "loss": 0.84517717, + "num_input_tokens_seen": 13249670, + "step": 624, + "time_per_iteration": 2.689629077911377 + }, + { + "auxiliary_loss_clip": 0.01192569, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_clip": 1.05239606, + "balance_loss_mlp": 1.01667988, + "epoch": 0.07515180664943186, + "flos": 15115519653120.0, + "grad_norm": 11.28834655395278, + "language_loss": 0.82844722, + "learning_rate": 3.97870519576058e-06, + "loss": 0.85139227, + "num_input_tokens_seen": 13266095, + "step": 625, + "time_per_iteration": 2.777644634246826 + }, + { + "auxiliary_loss_clip": 0.01184195, + "auxiliary_loss_mlp": 0.00875824, + "balance_loss_clip": 1.05420494, + "balance_loss_mlp": 1.00029302, + "epoch": 0.07527204954007094, + "flos": 21287702298240.0, + "grad_norm": 4.764471839662345, + "language_loss": 0.80785763, + "learning_rate": 3.978591675362295e-06, + "loss": 0.82845783, + "num_input_tokens_seen": 13284810, + "step": 626, + "time_per_iteration": 2.8418641090393066 + }, + { + "auxiliary_loss_clip": 0.01171246, + "auxiliary_loss_mlp": 0.01098208, + "balance_loss_clip": 1.05394745, + "balance_loss_mlp": 1.01299679, + "epoch": 0.07539229243071004, + "flos": 21324187537920.0, + "grad_norm": 1.8500346054104566, + "language_loss": 0.8752172, + "learning_rate": 3.978477854813226e-06, + "loss": 0.89791179, + "num_input_tokens_seen": 13304150, + "step": 627, + "time_per_iteration": 3.734428882598877 + }, + { + "auxiliary_loss_clip": 0.01210241, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_clip": 1.05971456, + "balance_loss_mlp": 1.01365674, + "epoch": 0.07551253532134912, + "flos": 13042540920960.0, + "grad_norm": 2.884939465456137, + "language_loss": 0.82181561, + "learning_rate": 3.97836373413064e-06, + "loss": 0.84490526, + "num_input_tokens_seen": 13322205, + "step": 628, + "time_per_iteration": 4.606187582015991 + }, + { + "auxiliary_loss_clip": 0.01216859, + "auxiliary_loss_mlp": 0.01096286, + "balance_loss_clip": 1.05843663, + "balance_loss_mlp": 1.01078963, + "epoch": 0.07563277821198822, + "flos": 19208761908480.0, + "grad_norm": 2.9644137349383586, + "language_loss": 0.74715632, + "learning_rate": 3.978249313331848e-06, + "loss": 0.77028775, + "num_input_tokens_seen": 13340435, + "step": 629, + "time_per_iteration": 3.7660412788391113 + }, + { + "auxiliary_loss_clip": 0.0121058, + "auxiliary_loss_mlp": 0.00875869, + "balance_loss_clip": 1.06018019, + "balance_loss_mlp": 1.00026667, + "epoch": 0.07575302110262731, + "flos": 19537200892800.0, + "grad_norm": 3.4465949823382287, + "language_loss": 0.6218726, + "learning_rate": 3.978134592434208e-06, + "loss": 0.64273715, + "num_input_tokens_seen": 13358185, + "step": 630, + "time_per_iteration": 2.7634172439575195 + }, + { + "auxiliary_loss_clip": 0.01156963, + "auxiliary_loss_mlp": 0.01087271, + "balance_loss_clip": 1.04841411, + "balance_loss_mlp": 1.00563622, + "epoch": 0.0758732639932664, + "flos": 67961808017280.0, + "grad_norm": 1.0050932597551963, + "language_loss": 0.5946756, + "learning_rate": 3.978019571455123e-06, + "loss": 0.61711794, + "num_input_tokens_seen": 13410130, + "step": 631, + "time_per_iteration": 3.5613749027252197 + }, + { + "auxiliary_loss_clip": 0.01218725, + "auxiliary_loss_mlp": 0.0109909, + "balance_loss_clip": 1.06023812, + "balance_loss_mlp": 1.01392698, + "epoch": 0.07599350688390549, + "flos": 18989204025600.0, + "grad_norm": 6.186299995764397, + "language_loss": 0.84005344, + "learning_rate": 3.977904250412042e-06, + "loss": 0.8632316, + "num_input_tokens_seen": 13429085, + "step": 632, + "time_per_iteration": 3.0075156688690186 + }, + { + "auxiliary_loss_clip": 0.01198185, + "auxiliary_loss_mlp": 0.01098869, + "balance_loss_clip": 1.05684972, + "balance_loss_mlp": 1.01341939, + "epoch": 0.07611374977454458, + "flos": 21069006341760.0, + "grad_norm": 2.15505885536428, + "language_loss": 0.85433912, + "learning_rate": 3.97778862932246e-06, + "loss": 0.87730962, + "num_input_tokens_seen": 13446250, + "step": 633, + "time_per_iteration": 2.8340060710906982 + }, + { + "auxiliary_loss_clip": 0.01131207, + "auxiliary_loss_mlp": 0.01096248, + "balance_loss_clip": 1.04430771, + "balance_loss_mlp": 1.0108459, + "epoch": 0.07623399266518367, + "flos": 18514536773760.0, + "grad_norm": 2.6988113232723263, + "language_loss": 0.93936467, + "learning_rate": 3.9776727082039144e-06, + "loss": 0.96163917, + "num_input_tokens_seen": 13463220, + "step": 634, + "time_per_iteration": 2.9944515228271484 + }, + { + "auxiliary_loss_clip": 0.01212299, + "auxiliary_loss_mlp": 0.01082637, + "balance_loss_clip": 1.0666697, + "balance_loss_mlp": 1.00100219, + "epoch": 0.07635423555582276, + "flos": 44663036077440.0, + "grad_norm": 0.8137694057347054, + "language_loss": 0.55492723, + "learning_rate": 3.977556487073991e-06, + "loss": 0.57787657, + "num_input_tokens_seen": 13517775, + "step": 635, + "time_per_iteration": 3.166349411010742 + }, + { + "auxiliary_loss_clip": 0.01197689, + "auxiliary_loss_mlp": 0.01097166, + "balance_loss_clip": 1.05591559, + "balance_loss_mlp": 1.01185966, + "epoch": 0.07647447844646185, + "flos": 21761148487680.0, + "grad_norm": 1.883799360584277, + "language_loss": 0.8137657, + "learning_rate": 3.97743996595032e-06, + "loss": 0.83671427, + "num_input_tokens_seen": 13537815, + "step": 636, + "time_per_iteration": 2.823408842086792 + }, + { + "auxiliary_loss_clip": 0.01218601, + "auxiliary_loss_mlp": 0.01097352, + "balance_loss_clip": 1.06043053, + "balance_loss_mlp": 1.01180696, + "epoch": 0.07659472133710095, + "flos": 23806799948160.0, + "grad_norm": 1.8479374530280375, + "language_loss": 0.81609321, + "learning_rate": 3.9773231448505804e-06, + "loss": 0.83925271, + "num_input_tokens_seen": 13559605, + "step": 637, + "time_per_iteration": 2.683788299560547 + }, + { + "auxiliary_loss_clip": 0.01190562, + "auxiliary_loss_mlp": 0.00875781, + "balance_loss_clip": 1.0542661, + "balance_loss_mlp": 1.00023329, + "epoch": 0.07671496422774003, + "flos": 21469984842240.0, + "grad_norm": 2.812157877032177, + "language_loss": 0.78193414, + "learning_rate": 3.977206023792491e-06, + "loss": 0.80259752, + "num_input_tokens_seen": 13579495, + "step": 638, + "time_per_iteration": 2.8271822929382324 + }, + { + "auxiliary_loss_clip": 0.01204336, + "auxiliary_loss_mlp": 0.01098186, + "balance_loss_clip": 1.05697072, + "balance_loss_mlp": 1.01245022, + "epoch": 0.07683520711837913, + "flos": 16980971558400.0, + "grad_norm": 2.5119953797859815, + "language_loss": 0.81402421, + "learning_rate": 3.97708860279382e-06, + "loss": 0.83704948, + "num_input_tokens_seen": 13597605, + "step": 639, + "time_per_iteration": 2.7106876373291016 + }, + { + "auxiliary_loss_clip": 0.01186599, + "auxiliary_loss_mlp": 0.01098583, + "balance_loss_clip": 1.05503988, + "balance_loss_mlp": 1.01299095, + "epoch": 0.07695545000901821, + "flos": 23476744851840.0, + "grad_norm": 3.84617802412808, + "language_loss": 0.78100187, + "learning_rate": 3.97697088187238e-06, + "loss": 0.80385369, + "num_input_tokens_seen": 13618120, + "step": 640, + "time_per_iteration": 2.8456058502197266 + }, + { + "auxiliary_loss_clip": 0.01195358, + "auxiliary_loss_mlp": 0.01095056, + "balance_loss_clip": 1.0578711, + "balance_loss_mlp": 1.01008403, + "epoch": 0.07707569289965731, + "flos": 17634258167040.0, + "grad_norm": 2.6614218141801143, + "language_loss": 0.92132956, + "learning_rate": 3.976852861046029e-06, + "loss": 0.94423372, + "num_input_tokens_seen": 13634735, + "step": 641, + "time_per_iteration": 2.8302173614501953 + }, + { + "auxiliary_loss_clip": 0.01172398, + "auxiliary_loss_mlp": 0.01096893, + "balance_loss_clip": 1.0523684, + "balance_loss_mlp": 1.01192021, + "epoch": 0.0771959357902964, + "flos": 25775674087680.0, + "grad_norm": 1.656546834417622, + "language_loss": 0.80368173, + "learning_rate": 3.97673454033267e-06, + "loss": 0.82637465, + "num_input_tokens_seen": 13656835, + "step": 642, + "time_per_iteration": 2.8416011333465576 + }, + { + "auxiliary_loss_clip": 0.01194143, + "auxiliary_loss_mlp": 0.01096199, + "balance_loss_clip": 1.05431128, + "balance_loss_mlp": 1.01117933, + "epoch": 0.07731617868093549, + "flos": 19828651847040.0, + "grad_norm": 2.7240126925360757, + "language_loss": 0.82514483, + "learning_rate": 3.976615919750254e-06, + "loss": 0.84804833, + "num_input_tokens_seen": 13674535, + "step": 643, + "time_per_iteration": 2.8869144916534424 + }, + { + "auxiliary_loss_clip": 0.01204536, + "auxiliary_loss_mlp": 0.01097468, + "balance_loss_clip": 1.05771184, + "balance_loss_mlp": 1.01230526, + "epoch": 0.07743642157157458, + "flos": 21324654414720.0, + "grad_norm": 1.8863504056049185, + "language_loss": 0.86993814, + "learning_rate": 3.976496999316775e-06, + "loss": 0.8929581, + "num_input_tokens_seen": 13693290, + "step": 644, + "time_per_iteration": 2.827075481414795 + }, + { + "auxiliary_loss_clip": 0.01188762, + "auxiliary_loss_mlp": 0.01097184, + "balance_loss_clip": 1.0537231, + "balance_loss_mlp": 1.01192522, + "epoch": 0.07755666446221367, + "flos": 19969133938560.0, + "grad_norm": 1.9142236555311605, + "language_loss": 0.84112668, + "learning_rate": 3.976377779050271e-06, + "loss": 0.86398619, + "num_input_tokens_seen": 13711420, + "step": 645, + "time_per_iteration": 2.77377986907959 + }, + { + "auxiliary_loss_clip": 0.01197486, + "auxiliary_loss_mlp": 0.01096804, + "balance_loss_clip": 1.05630231, + "balance_loss_mlp": 1.01125896, + "epoch": 0.07767690735285276, + "flos": 23623224514560.0, + "grad_norm": 1.9626929100921728, + "language_loss": 0.84178329, + "learning_rate": 3.976258258968831e-06, + "loss": 0.86472619, + "num_input_tokens_seen": 13729965, + "step": 646, + "time_per_iteration": 2.8034591674804688 + }, + { + "auxiliary_loss_clip": 0.01173878, + "auxiliary_loss_mlp": 0.01095689, + "balance_loss_clip": 1.04933381, + "balance_loss_mlp": 1.01062107, + "epoch": 0.07779715024349185, + "flos": 22236246702720.0, + "grad_norm": 2.4649043223299656, + "language_loss": 0.74274063, + "learning_rate": 3.976138439090583e-06, + "loss": 0.76543629, + "num_input_tokens_seen": 13748045, + "step": 647, + "time_per_iteration": 3.10040545463562 + }, + { + "auxiliary_loss_clip": 0.01182468, + "auxiliary_loss_mlp": 0.01096571, + "balance_loss_clip": 1.05464029, + "balance_loss_mlp": 1.01126432, + "epoch": 0.07791739313413094, + "flos": 20955097336320.0, + "grad_norm": 2.2136214429511205, + "language_loss": 0.85179514, + "learning_rate": 3.976018319433706e-06, + "loss": 0.87458551, + "num_input_tokens_seen": 13765590, + "step": 648, + "time_per_iteration": 2.9174857139587402 + }, + { + "auxiliary_loss_clip": 0.01204834, + "auxiliary_loss_mlp": 0.01095078, + "balance_loss_clip": 1.05702472, + "balance_loss_mlp": 1.00953317, + "epoch": 0.07803763602477004, + "flos": 19312327797120.0, + "grad_norm": 2.302046393813191, + "language_loss": 0.90983617, + "learning_rate": 3.9758979000164205e-06, + "loss": 0.93283528, + "num_input_tokens_seen": 13782410, + "step": 649, + "time_per_iteration": 2.864283323287964 + }, + { + "auxiliary_loss_clip": 0.01187395, + "auxiliary_loss_mlp": 0.01095825, + "balance_loss_clip": 1.05692148, + "balance_loss_mlp": 1.01037538, + "epoch": 0.07815787891540912, + "flos": 22710806213760.0, + "grad_norm": 4.906207704223628, + "language_loss": 0.7218973, + "learning_rate": 3.975777180856995e-06, + "loss": 0.74472958, + "num_input_tokens_seen": 13801530, + "step": 650, + "time_per_iteration": 2.888679265975952 + }, + { + "auxiliary_loss_clip": 0.01218034, + "auxiliary_loss_mlp": 0.01097629, + "balance_loss_clip": 1.05995083, + "balance_loss_mlp": 1.01213241, + "epoch": 0.07827812180604822, + "flos": 22711129436160.0, + "grad_norm": 2.0432836516300243, + "language_loss": 0.85845411, + "learning_rate": 3.975656161973742e-06, + "loss": 0.88161075, + "num_input_tokens_seen": 13820615, + "step": 651, + "time_per_iteration": 2.747731924057007 + }, + { + "auxiliary_loss_clip": 0.01217097, + "auxiliary_loss_mlp": 0.01099281, + "balance_loss_clip": 1.05959964, + "balance_loss_mlp": 1.01368856, + "epoch": 0.0783983646966873, + "flos": 21725597001600.0, + "grad_norm": 2.501769676239467, + "language_loss": 0.88491178, + "learning_rate": 3.9755348433850194e-06, + "loss": 0.90807551, + "num_input_tokens_seen": 13835955, + "step": 652, + "time_per_iteration": 3.6412580013275146 + }, + { + "auxiliary_loss_clip": 0.01179762, + "auxiliary_loss_mlp": 0.01081536, + "balance_loss_clip": 1.06296563, + "balance_loss_mlp": 1.00028265, + "epoch": 0.0785186075873264, + "flos": 60640877537280.0, + "grad_norm": 0.9721406348122728, + "language_loss": 0.63637841, + "learning_rate": 3.975413225109232e-06, + "loss": 0.65899146, + "num_input_tokens_seen": 13896505, + "step": 653, + "time_per_iteration": 4.285202741622925 + }, + { + "auxiliary_loss_clip": 0.01204045, + "auxiliary_loss_mlp": 0.01098216, + "balance_loss_clip": 1.05632961, + "balance_loss_mlp": 1.01252866, + "epoch": 0.0786388504779655, + "flos": 23877902920320.0, + "grad_norm": 3.3446386235537866, + "language_loss": 0.93603194, + "learning_rate": 3.975291307164829e-06, + "loss": 0.95905453, + "num_input_tokens_seen": 13915150, + "step": 654, + "time_per_iteration": 3.838984727859497 + }, + { + "auxiliary_loss_clip": 0.01179539, + "auxiliary_loss_mlp": 0.0109612, + "balance_loss_clip": 1.04991698, + "balance_loss_mlp": 1.01148176, + "epoch": 0.07875909336860458, + "flos": 15158684822400.0, + "grad_norm": 1.9732907469124608, + "language_loss": 0.84905565, + "learning_rate": 3.975169089570306e-06, + "loss": 0.87181222, + "num_input_tokens_seen": 13933525, + "step": 655, + "time_per_iteration": 3.79719614982605 + }, + { + "auxiliary_loss_clip": 0.01198788, + "auxiliary_loss_mlp": 0.01098661, + "balance_loss_clip": 1.05858696, + "balance_loss_mlp": 1.01359344, + "epoch": 0.07887933625924368, + "flos": 22236857233920.0, + "grad_norm": 2.2264723295317888, + "language_loss": 0.91766131, + "learning_rate": 3.975046572344202e-06, + "loss": 0.9406358, + "num_input_tokens_seen": 13949985, + "step": 656, + "time_per_iteration": 2.987896680831909 + }, + { + "auxiliary_loss_clip": 0.01179663, + "auxiliary_loss_mlp": 0.01095133, + "balance_loss_clip": 1.0555526, + "balance_loss_mlp": 1.01016009, + "epoch": 0.07899957914988276, + "flos": 20777734955520.0, + "grad_norm": 2.045390439268704, + "language_loss": 0.71088564, + "learning_rate": 3.974923755505103e-06, + "loss": 0.73363364, + "num_input_tokens_seen": 13969215, + "step": 657, + "time_per_iteration": 2.954979658126831 + }, + { + "auxiliary_loss_clip": 0.01188141, + "auxiliary_loss_mlp": 0.01096536, + "balance_loss_clip": 1.05729234, + "balance_loss_mlp": 1.01156378, + "epoch": 0.07911982204052186, + "flos": 23003047267200.0, + "grad_norm": 1.8270819093188602, + "language_loss": 0.91087991, + "learning_rate": 3.974800639071641e-06, + "loss": 0.93372667, + "num_input_tokens_seen": 13989935, + "step": 658, + "time_per_iteration": 2.8759639263153076 + }, + { + "auxiliary_loss_clip": 0.01161041, + "auxiliary_loss_mlp": 0.00875703, + "balance_loss_clip": 1.05110049, + "balance_loss_mlp": 1.00007486, + "epoch": 0.07924006493116094, + "flos": 23111389664640.0, + "grad_norm": 2.09071877829175, + "language_loss": 1.00573969, + "learning_rate": 3.974677223062492e-06, + "loss": 1.02610719, + "num_input_tokens_seen": 14007150, + "step": 659, + "time_per_iteration": 2.951857566833496 + }, + { + "auxiliary_loss_clip": 0.01187992, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_clip": 1.05442357, + "balance_loss_mlp": 1.01225495, + "epoch": 0.07936030782180004, + "flos": 16472153450880.0, + "grad_norm": 2.3114261851596862, + "language_loss": 0.74285549, + "learning_rate": 3.974553507496378e-06, + "loss": 0.76570767, + "num_input_tokens_seen": 14025725, + "step": 660, + "time_per_iteration": 2.7567594051361084 + }, + { + "auxiliary_loss_clip": 0.0118305, + "auxiliary_loss_mlp": 0.01098646, + "balance_loss_clip": 1.05492353, + "balance_loss_mlp": 1.01295865, + "epoch": 0.07948055071243913, + "flos": 23733290764800.0, + "grad_norm": 15.516238693001773, + "language_loss": 0.89012325, + "learning_rate": 3.974429492392068e-06, + "loss": 0.9129402, + "num_input_tokens_seen": 14045750, + "step": 661, + "time_per_iteration": 2.7867023944854736 + }, + { + "auxiliary_loss_clip": 0.01216386, + "auxiliary_loss_mlp": 0.00875625, + "balance_loss_clip": 1.05900288, + "balance_loss_mlp": 1.00006807, + "epoch": 0.07960079360307822, + "flos": 19573326996480.0, + "grad_norm": 2.639066313407411, + "language_loss": 0.91018003, + "learning_rate": 3.974305177768373e-06, + "loss": 0.93110013, + "num_input_tokens_seen": 14063960, + "step": 662, + "time_per_iteration": 2.7042829990386963 + }, + { + "auxiliary_loss_clip": 0.01182554, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_clip": 1.05435944, + "balance_loss_mlp": 1.01491785, + "epoch": 0.07972103649371731, + "flos": 23513409659520.0, + "grad_norm": 2.161275745335635, + "language_loss": 0.86402261, + "learning_rate": 3.974180563644152e-06, + "loss": 0.88685131, + "num_input_tokens_seen": 14082525, + "step": 663, + "time_per_iteration": 2.828674793243408 + }, + { + "auxiliary_loss_clip": 0.01194571, + "auxiliary_loss_mlp": 0.01098482, + "balance_loss_clip": 1.05524182, + "balance_loss_mlp": 1.01327133, + "epoch": 0.0798412793843564, + "flos": 16726867770240.0, + "grad_norm": 2.2528792556154515, + "language_loss": 0.88884312, + "learning_rate": 3.97405565003831e-06, + "loss": 0.91177368, + "num_input_tokens_seen": 14098610, + "step": 664, + "time_per_iteration": 2.816803216934204 + }, + { + "auxiliary_loss_clip": 0.01174512, + "auxiliary_loss_mlp": 0.01096487, + "balance_loss_clip": 1.04881394, + "balance_loss_mlp": 1.01160955, + "epoch": 0.07996152227499549, + "flos": 18223337214720.0, + "grad_norm": 3.9952620061708326, + "language_loss": 0.78268814, + "learning_rate": 3.973930436969794e-06, + "loss": 0.80539811, + "num_input_tokens_seen": 14117065, + "step": 665, + "time_per_iteration": 2.8534421920776367 + }, + { + "auxiliary_loss_clip": 0.01189437, + "auxiliary_loss_mlp": 0.01098466, + "balance_loss_clip": 1.0570848, + "balance_loss_mlp": 1.01316023, + "epoch": 0.08008176516563459, + "flos": 20594877793920.0, + "grad_norm": 14.37389302703078, + "language_loss": 0.85796392, + "learning_rate": 3.973804924457602e-06, + "loss": 0.88084304, + "num_input_tokens_seen": 14135145, + "step": 666, + "time_per_iteration": 2.8145196437835693 + }, + { + "auxiliary_loss_clip": 0.01188714, + "auxiliary_loss_mlp": 0.01097415, + "balance_loss_clip": 1.05671012, + "balance_loss_mlp": 1.01225245, + "epoch": 0.08020200805627367, + "flos": 31834306863360.0, + "grad_norm": 1.7504205312710053, + "language_loss": 0.85768569, + "learning_rate": 3.973679112520771e-06, + "loss": 0.88054693, + "num_input_tokens_seen": 14156860, + "step": 667, + "time_per_iteration": 2.8034465312957764 + }, + { + "auxiliary_loss_clip": 0.01179524, + "auxiliary_loss_mlp": 0.01097487, + "balance_loss_clip": 1.05174232, + "balance_loss_mlp": 1.0123713, + "epoch": 0.08032225094691277, + "flos": 17783503176960.0, + "grad_norm": 6.928495036937417, + "language_loss": 0.99133241, + "learning_rate": 3.973553001178389e-06, + "loss": 1.01410258, + "num_input_tokens_seen": 14174365, + "step": 668, + "time_per_iteration": 2.8630623817443848 + }, + { + "auxiliary_loss_clip": 0.01179271, + "auxiliary_loss_mlp": 0.01094938, + "balance_loss_clip": 1.05215192, + "balance_loss_mlp": 1.01006126, + "epoch": 0.08044249383755185, + "flos": 24061693835520.0, + "grad_norm": 2.082082685929652, + "language_loss": 0.76028764, + "learning_rate": 3.973426590449585e-06, + "loss": 0.78302968, + "num_input_tokens_seen": 14192320, + "step": 669, + "time_per_iteration": 2.8344221115112305 + }, + { + "auxiliary_loss_clip": 0.01167034, + "auxiliary_loss_mlp": 0.01100507, + "balance_loss_clip": 1.05106258, + "balance_loss_mlp": 1.01500988, + "epoch": 0.08056273672819095, + "flos": 18223624523520.0, + "grad_norm": 2.4824948791104826, + "language_loss": 0.75152457, + "learning_rate": 3.9732998803535364e-06, + "loss": 0.77420002, + "num_input_tokens_seen": 14210380, + "step": 670, + "time_per_iteration": 2.9041154384613037 + }, + { + "auxiliary_loss_clip": 0.01215582, + "auxiliary_loss_mlp": 0.01095722, + "balance_loss_clip": 1.05810452, + "balance_loss_mlp": 1.01055884, + "epoch": 0.08068297961883003, + "flos": 19676856971520.0, + "grad_norm": 2.3246421909636856, + "language_loss": 0.85030246, + "learning_rate": 3.973172870909465e-06, + "loss": 0.87341547, + "num_input_tokens_seen": 14225145, + "step": 671, + "time_per_iteration": 2.7596945762634277 + }, + { + "auxiliary_loss_clip": 0.01195449, + "auxiliary_loss_mlp": 0.01097429, + "balance_loss_clip": 1.05627227, + "balance_loss_mlp": 1.01159835, + "epoch": 0.08080322250946913, + "flos": 23148736830720.0, + "grad_norm": 2.3072432978732844, + "language_loss": 0.80886221, + "learning_rate": 3.973045562136638e-06, + "loss": 0.83179098, + "num_input_tokens_seen": 14241960, + "step": 672, + "time_per_iteration": 2.8041133880615234 + }, + { + "auxiliary_loss_clip": 0.01208006, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_clip": 1.05862474, + "balance_loss_mlp": 1.01044142, + "epoch": 0.08092346540010822, + "flos": 21763626526080.0, + "grad_norm": 2.0866697078103886, + "language_loss": 0.91570091, + "learning_rate": 3.972917954054368e-06, + "loss": 0.93873703, + "num_input_tokens_seen": 14260515, + "step": 673, + "time_per_iteration": 2.7318623065948486 + }, + { + "auxiliary_loss_clip": 0.01198336, + "auxiliary_loss_mlp": 0.01097591, + "balance_loss_clip": 1.05888653, + "balance_loss_mlp": 1.01166487, + "epoch": 0.08104370829074731, + "flos": 21032485188480.0, + "grad_norm": 2.4163810042438683, + "language_loss": 0.8186847, + "learning_rate": 3.972790046682013e-06, + "loss": 0.84164393, + "num_input_tokens_seen": 14279190, + "step": 674, + "time_per_iteration": 2.7382776737213135 + }, + { + "auxiliary_loss_clip": 0.01175867, + "auxiliary_loss_mlp": 0.01095878, + "balance_loss_clip": 1.04671621, + "balance_loss_mlp": 1.01057196, + "epoch": 0.0811639511813864, + "flos": 20083186598400.0, + "grad_norm": 1.8996396941404534, + "language_loss": 0.7889384, + "learning_rate": 3.972661840038977e-06, + "loss": 0.81165588, + "num_input_tokens_seen": 14299480, + "step": 675, + "time_per_iteration": 2.9617350101470947 + }, + { + "auxiliary_loss_clip": 0.01200561, + "auxiliary_loss_mlp": 0.01098253, + "balance_loss_clip": 1.05360365, + "balance_loss_mlp": 1.01275575, + "epoch": 0.08128419407202549, + "flos": 16836718538880.0, + "grad_norm": 2.2490996851875105, + "language_loss": 0.83250833, + "learning_rate": 3.972533334144707e-06, + "loss": 0.85549653, + "num_input_tokens_seen": 14316405, + "step": 676, + "time_per_iteration": 2.804591417312622 + }, + { + "auxiliary_loss_clip": 0.01207057, + "auxiliary_loss_mlp": 0.01095427, + "balance_loss_clip": 1.05856454, + "balance_loss_mlp": 1.00983441, + "epoch": 0.08140443696266458, + "flos": 23769273214080.0, + "grad_norm": 1.969790632850233, + "language_loss": 0.7852515, + "learning_rate": 3.972404529018699e-06, + "loss": 0.80827636, + "num_input_tokens_seen": 14336265, + "step": 677, + "time_per_iteration": 3.737384080886841 + }, + { + "auxiliary_loss_clip": 0.01193766, + "auxiliary_loss_mlp": 0.01096084, + "balance_loss_clip": 1.05202889, + "balance_loss_mlp": 1.01101649, + "epoch": 0.08152467985330367, + "flos": 24390132819840.0, + "grad_norm": 2.267074901075785, + "language_loss": 0.85251367, + "learning_rate": 3.972275424680493e-06, + "loss": 0.87541223, + "num_input_tokens_seen": 14356375, + "step": 678, + "time_per_iteration": 3.8012683391571045 + }, + { + "auxiliary_loss_clip": 0.01214447, + "auxiliary_loss_mlp": 0.01094985, + "balance_loss_clip": 1.05778825, + "balance_loss_mlp": 1.00967908, + "epoch": 0.08164492274394276, + "flos": 19317750750720.0, + "grad_norm": 2.543213210476715, + "language_loss": 0.91842842, + "learning_rate": 3.972146021149673e-06, + "loss": 0.94152272, + "num_input_tokens_seen": 14374650, + "step": 679, + "time_per_iteration": 3.731121063232422 + }, + { + "auxiliary_loss_clip": 0.01170094, + "auxiliary_loss_mlp": 0.0109783, + "balance_loss_clip": 1.04314935, + "balance_loss_mlp": 1.01300049, + "epoch": 0.08176516563458186, + "flos": 14830461319680.0, + "grad_norm": 2.5618441567267105, + "language_loss": 0.78632224, + "learning_rate": 3.972016318445868e-06, + "loss": 0.80900145, + "num_input_tokens_seen": 14392650, + "step": 680, + "time_per_iteration": 3.7227025032043457 + }, + { + "auxiliary_loss_clip": 0.01202051, + "auxiliary_loss_mlp": 0.01098854, + "balance_loss_clip": 1.05427694, + "balance_loss_mlp": 1.01326168, + "epoch": 0.08188540852522094, + "flos": 22602320161920.0, + "grad_norm": 1.924173676087029, + "language_loss": 0.92449003, + "learning_rate": 3.971886316588757e-06, + "loss": 0.9474991, + "num_input_tokens_seen": 14413155, + "step": 681, + "time_per_iteration": 2.846747398376465 + }, + { + "auxiliary_loss_clip": 0.01173302, + "auxiliary_loss_mlp": 0.01097609, + "balance_loss_clip": 1.05567575, + "balance_loss_mlp": 1.01220703, + "epoch": 0.08200565141586004, + "flos": 19463727623040.0, + "grad_norm": 3.2108411415999547, + "language_loss": 0.73600429, + "learning_rate": 3.9717560155980595e-06, + "loss": 0.75871342, + "num_input_tokens_seen": 14428805, + "step": 682, + "time_per_iteration": 2.8027760982513428 + }, + { + "auxiliary_loss_clip": 0.01202239, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_clip": 1.05519581, + "balance_loss_mlp": 1.00990272, + "epoch": 0.08212589430649912, + "flos": 20594662312320.0, + "grad_norm": 2.3278632093720333, + "language_loss": 0.91472197, + "learning_rate": 3.971625415493542e-06, + "loss": 0.9376936, + "num_input_tokens_seen": 14447125, + "step": 683, + "time_per_iteration": 2.7522435188293457 + }, + { + "auxiliary_loss_clip": 0.01179032, + "auxiliary_loss_mlp": 0.01095224, + "balance_loss_clip": 1.05133724, + "balance_loss_mlp": 1.01006043, + "epoch": 0.08224613719713822, + "flos": 25953611086080.0, + "grad_norm": 2.179140410123176, + "language_loss": 0.87489372, + "learning_rate": 3.971494516295017e-06, + "loss": 0.89763629, + "num_input_tokens_seen": 14466575, + "step": 684, + "time_per_iteration": 2.9177393913269043 + }, + { + "auxiliary_loss_clip": 0.01178316, + "auxiliary_loss_mlp": 0.01099204, + "balance_loss_clip": 1.04991102, + "balance_loss_mlp": 1.01399279, + "epoch": 0.08236638008777732, + "flos": 23768734510080.0, + "grad_norm": 1.9242910744198451, + "language_loss": 0.85010588, + "learning_rate": 3.971363318022341e-06, + "loss": 0.87288111, + "num_input_tokens_seen": 14487915, + "step": 685, + "time_per_iteration": 2.7466213703155518 + }, + { + "auxiliary_loss_clip": 0.01196038, + "auxiliary_loss_mlp": 0.01097487, + "balance_loss_clip": 1.05481684, + "balance_loss_mlp": 1.01189423, + "epoch": 0.0824866229784164, + "flos": 38799144887040.0, + "grad_norm": 1.9971018569586105, + "language_loss": 0.6844089, + "learning_rate": 3.971231820695417e-06, + "loss": 0.70734417, + "num_input_tokens_seen": 14511530, + "step": 686, + "time_per_iteration": 2.9572789669036865 + }, + { + "auxiliary_loss_clip": 0.01195169, + "auxiliary_loss_mlp": 0.01098648, + "balance_loss_clip": 1.05562282, + "balance_loss_mlp": 1.01353264, + "epoch": 0.0826068658690555, + "flos": 23107762391040.0, + "grad_norm": 2.693165951334655, + "language_loss": 0.81448436, + "learning_rate": 3.971100024334193e-06, + "loss": 0.83742255, + "num_input_tokens_seen": 14529050, + "step": 687, + "time_per_iteration": 2.774238348007202 + }, + { + "auxiliary_loss_clip": 0.01174051, + "auxiliary_loss_mlp": 0.01097489, + "balance_loss_clip": 1.05292416, + "balance_loss_mlp": 1.01242125, + "epoch": 0.08272710875969458, + "flos": 21136374299520.0, + "grad_norm": 2.466177520340911, + "language_loss": 0.86102825, + "learning_rate": 3.970967928958663e-06, + "loss": 0.88374364, + "num_input_tokens_seen": 14546165, + "step": 688, + "time_per_iteration": 2.8203794956207275 + }, + { + "auxiliary_loss_clip": 0.01173043, + "auxiliary_loss_mlp": 0.01098746, + "balance_loss_clip": 1.05212331, + "balance_loss_mlp": 1.01391673, + "epoch": 0.08284735165033368, + "flos": 19063000517760.0, + "grad_norm": 1.6271307898948897, + "language_loss": 0.83633858, + "learning_rate": 3.970835534588865e-06, + "loss": 0.85905647, + "num_input_tokens_seen": 14563660, + "step": 689, + "time_per_iteration": 2.898449420928955 + }, + { + "auxiliary_loss_clip": 0.01193359, + "auxiliary_loss_mlp": 0.01093346, + "balance_loss_clip": 1.05775702, + "balance_loss_mlp": 1.00865984, + "epoch": 0.08296759454097276, + "flos": 16727442387840.0, + "grad_norm": 1.894703836577559, + "language_loss": 0.86048019, + "learning_rate": 3.970702841244883e-06, + "loss": 0.88334727, + "num_input_tokens_seen": 14581980, + "step": 690, + "time_per_iteration": 2.768564462661743 + }, + { + "auxiliary_loss_clip": 0.01203843, + "auxiliary_loss_mlp": 0.01095269, + "balance_loss_clip": 1.05639291, + "balance_loss_mlp": 1.0102495, + "epoch": 0.08308783743161186, + "flos": 18004928567040.0, + "grad_norm": 2.029409996356348, + "language_loss": 0.82286668, + "learning_rate": 3.970569848946847e-06, + "loss": 0.84585774, + "num_input_tokens_seen": 14601795, + "step": 691, + "time_per_iteration": 2.862496852874756 + }, + { + "auxiliary_loss_clip": 0.01194668, + "auxiliary_loss_mlp": 0.01093595, + "balance_loss_clip": 1.05580509, + "balance_loss_mlp": 1.00867033, + "epoch": 0.08320808032225095, + "flos": 15079788599040.0, + "grad_norm": 2.8097764951467474, + "language_loss": 0.82788742, + "learning_rate": 3.970436557714932e-06, + "loss": 0.85077, + "num_input_tokens_seen": 14618315, + "step": 692, + "time_per_iteration": 2.7169346809387207 + }, + { + "auxiliary_loss_clip": 0.01180425, + "auxiliary_loss_mlp": 0.01096853, + "balance_loss_clip": 1.04951084, + "balance_loss_mlp": 1.0117377, + "epoch": 0.08332832321289003, + "flos": 22383085501440.0, + "grad_norm": 2.408249549955537, + "language_loss": 0.86946762, + "learning_rate": 3.970302967569358e-06, + "loss": 0.89224029, + "num_input_tokens_seen": 14636905, + "step": 693, + "time_per_iteration": 2.738409996032715 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.01098922, + "balance_loss_clip": 1.05174398, + "balance_loss_mlp": 1.01352072, + "epoch": 0.08344856610352913, + "flos": 24717386655360.0, + "grad_norm": 2.7988839786526523, + "language_loss": 0.67800844, + "learning_rate": 3.9701690785303896e-06, + "loss": 0.70096064, + "num_input_tokens_seen": 14656100, + "step": 694, + "time_per_iteration": 2.7342069149017334 + }, + { + "auxiliary_loss_clip": 0.01202991, + "auxiliary_loss_mlp": 0.01093883, + "balance_loss_clip": 1.05569828, + "balance_loss_mlp": 1.00891018, + "epoch": 0.08356880899416821, + "flos": 25370206387200.0, + "grad_norm": 2.009646054220096, + "language_loss": 0.88276047, + "learning_rate": 3.970034890618339e-06, + "loss": 0.90572923, + "num_input_tokens_seen": 14675790, + "step": 695, + "time_per_iteration": 2.763328790664673 + }, + { + "auxiliary_loss_clip": 0.01193497, + "auxiliary_loss_mlp": 0.01097353, + "balance_loss_clip": 1.05408382, + "balance_loss_mlp": 1.01257157, + "epoch": 0.08368905188480731, + "flos": 24353072962560.0, + "grad_norm": 2.2341835232470717, + "language_loss": 0.8800807, + "learning_rate": 3.969900403853562e-06, + "loss": 0.90298927, + "num_input_tokens_seen": 14694830, + "step": 696, + "time_per_iteration": 2.7192223072052 + }, + { + "auxiliary_loss_clip": 0.01212594, + "auxiliary_loss_mlp": 0.0109532, + "balance_loss_clip": 1.05632031, + "balance_loss_mlp": 1.01034808, + "epoch": 0.08380929477544641, + "flos": 18037319656320.0, + "grad_norm": 1.751281733245687, + "language_loss": 0.77969825, + "learning_rate": 3.96976561825646e-06, + "loss": 0.80277735, + "num_input_tokens_seen": 14711920, + "step": 697, + "time_per_iteration": 2.7212674617767334 + }, + { + "auxiliary_loss_clip": 0.01163665, + "auxiliary_loss_mlp": 0.01096592, + "balance_loss_clip": 1.0446918, + "balance_loss_mlp": 1.01171458, + "epoch": 0.08392953766608549, + "flos": 26286287875200.0, + "grad_norm": 5.299493334083844, + "language_loss": 0.86958498, + "learning_rate": 3.969630533847479e-06, + "loss": 0.89218754, + "num_input_tokens_seen": 14730880, + "step": 698, + "time_per_iteration": 2.832357883453369 + }, + { + "auxiliary_loss_clip": 0.01201366, + "auxiliary_loss_mlp": 0.01094524, + "balance_loss_clip": 1.05409276, + "balance_loss_mlp": 1.00983775, + "epoch": 0.08404978055672459, + "flos": 22492146170880.0, + "grad_norm": 2.3621061549419613, + "language_loss": 0.84119308, + "learning_rate": 3.969495150647113e-06, + "loss": 0.86415195, + "num_input_tokens_seen": 14749050, + "step": 699, + "time_per_iteration": 2.856032609939575 + }, + { + "auxiliary_loss_clip": 0.01172469, + "auxiliary_loss_mlp": 0.01100831, + "balance_loss_clip": 1.04717195, + "balance_loss_mlp": 1.01585889, + "epoch": 0.08417002344736367, + "flos": 24826878288000.0, + "grad_norm": 8.820374957364404, + "language_loss": 0.76523018, + "learning_rate": 3.969359468675899e-06, + "loss": 0.78796321, + "num_input_tokens_seen": 14769180, + "step": 700, + "time_per_iteration": 2.9218153953552246 + }, + { + "auxiliary_loss_clip": 0.01198448, + "auxiliary_loss_mlp": 0.01093801, + "balance_loss_clip": 1.05385447, + "balance_loss_mlp": 1.00911462, + "epoch": 0.08429026633800277, + "flos": 16945922862720.0, + "grad_norm": 3.0295295037614043, + "language_loss": 0.8924942, + "learning_rate": 3.969223487954418e-06, + "loss": 0.91541666, + "num_input_tokens_seen": 14786640, + "step": 701, + "time_per_iteration": 2.720918893814087 + }, + { + "auxiliary_loss_clip": 0.01162674, + "auxiliary_loss_mlp": 0.01093999, + "balance_loss_clip": 1.04462671, + "balance_loss_mlp": 1.00931227, + "epoch": 0.08441050922864185, + "flos": 23841920471040.0, + "grad_norm": 2.094228230606666, + "language_loss": 0.82589084, + "learning_rate": 3.969087208503301e-06, + "loss": 0.84845757, + "num_input_tokens_seen": 14806720, + "step": 702, + "time_per_iteration": 2.8553130626678467 + }, + { + "auxiliary_loss_clip": 0.01170448, + "auxiliary_loss_mlp": 0.0109373, + "balance_loss_clip": 1.05161858, + "balance_loss_mlp": 1.00890112, + "epoch": 0.08453075211928095, + "flos": 25520205582720.0, + "grad_norm": 2.1807879371447654, + "language_loss": 0.84463573, + "learning_rate": 3.968950630343219e-06, + "loss": 0.86727756, + "num_input_tokens_seen": 14823705, + "step": 703, + "time_per_iteration": 3.867314577102661 + }, + { + "auxiliary_loss_clip": 0.0119034, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_clip": 1.0512836, + "balance_loss_mlp": 1.00919151, + "epoch": 0.08465099500992004, + "flos": 19532496211200.0, + "grad_norm": 2.258761663892769, + "language_loss": 0.9350608, + "learning_rate": 3.968813753494892e-06, + "loss": 0.95790297, + "num_input_tokens_seen": 14841865, + "step": 704, + "time_per_iteration": 3.933511257171631 + }, + { + "auxiliary_loss_clip": 0.01180474, + "auxiliary_loss_mlp": 0.00875724, + "balance_loss_clip": 1.05123341, + "balance_loss_mlp": 1.00033844, + "epoch": 0.08477123790055913, + "flos": 29351299403520.0, + "grad_norm": 2.1707680114593373, + "language_loss": 0.75172627, + "learning_rate": 3.968676577979084e-06, + "loss": 0.7722882, + "num_input_tokens_seen": 14861415, + "step": 705, + "time_per_iteration": 2.8660857677459717 + }, + { + "auxiliary_loss_clip": 0.01147445, + "auxiliary_loss_mlp": 0.01095458, + "balance_loss_clip": 1.04515934, + "balance_loss_mlp": 1.01058078, + "epoch": 0.08489148079119822, + "flos": 18624495283200.0, + "grad_norm": 4.891816664112017, + "language_loss": 0.78276139, + "learning_rate": 3.968539103816605e-06, + "loss": 0.80519044, + "num_input_tokens_seen": 14879215, + "step": 706, + "time_per_iteration": 3.7743136882781982 + }, + { + "auxiliary_loss_clip": 0.01175655, + "auxiliary_loss_mlp": 0.00875739, + "balance_loss_clip": 1.04873955, + "balance_loss_mlp": 1.00038362, + "epoch": 0.0850117236818373, + "flos": 23471393725440.0, + "grad_norm": 1.9520478324468011, + "language_loss": 0.89128804, + "learning_rate": 3.9684013310283085e-06, + "loss": 0.91180199, + "num_input_tokens_seen": 14897900, + "step": 707, + "time_per_iteration": 2.739583969116211 + }, + { + "auxiliary_loss_clip": 0.01187861, + "auxiliary_loss_mlp": 0.01096437, + "balance_loss_clip": 1.05243123, + "balance_loss_mlp": 1.01184583, + "epoch": 0.0851319665724764, + "flos": 40625058896640.0, + "grad_norm": 1.9486878002391446, + "language_loss": 0.64213943, + "learning_rate": 3.9682632596350956e-06, + "loss": 0.66498244, + "num_input_tokens_seen": 14919065, + "step": 708, + "time_per_iteration": 2.916283130645752 + }, + { + "auxiliary_loss_clip": 0.01198955, + "auxiliary_loss_mlp": 0.01092708, + "balance_loss_clip": 1.05440164, + "balance_loss_mlp": 1.00806904, + "epoch": 0.0852522094631155, + "flos": 15879554870400.0, + "grad_norm": 2.0314214843815854, + "language_loss": 0.78346795, + "learning_rate": 3.968124889657911e-06, + "loss": 0.80638456, + "num_input_tokens_seen": 14934165, + "step": 709, + "time_per_iteration": 2.6559195518493652 + }, + { + "auxiliary_loss_clip": 0.01162948, + "auxiliary_loss_mlp": 0.01095873, + "balance_loss_clip": 1.04435849, + "balance_loss_mlp": 1.01071048, + "epoch": 0.08537245235375458, + "flos": 14567091822720.0, + "grad_norm": 2.7832411826156505, + "language_loss": 0.91037863, + "learning_rate": 3.967986221117746e-06, + "loss": 0.93296683, + "num_input_tokens_seen": 14950105, + "step": 710, + "time_per_iteration": 2.864032745361328 + }, + { + "auxiliary_loss_clip": 0.01139344, + "auxiliary_loss_mlp": 0.01094103, + "balance_loss_clip": 1.04164124, + "balance_loss_mlp": 1.00936949, + "epoch": 0.08549269524439368, + "flos": 26468929555200.0, + "grad_norm": 4.580500934520186, + "language_loss": 0.86458182, + "learning_rate": 3.967847254035635e-06, + "loss": 0.88691634, + "num_input_tokens_seen": 14969490, + "step": 711, + "time_per_iteration": 3.117532968521118 + }, + { + "auxiliary_loss_clip": 0.01173844, + "auxiliary_loss_mlp": 0.0109506, + "balance_loss_clip": 1.0470196, + "balance_loss_mlp": 1.00999248, + "epoch": 0.08561293813503276, + "flos": 13590214565760.0, + "grad_norm": 2.2333382870009872, + "language_loss": 0.8643266, + "learning_rate": 3.967707988432661e-06, + "loss": 0.88701552, + "num_input_tokens_seen": 14987195, + "step": 712, + "time_per_iteration": 3.0741565227508545 + }, + { + "auxiliary_loss_clip": 0.01210389, + "auxiliary_loss_mlp": 0.01095556, + "balance_loss_clip": 1.05471992, + "balance_loss_mlp": 1.01034498, + "epoch": 0.08573318102567186, + "flos": 26943524979840.0, + "grad_norm": 2.470567813741849, + "language_loss": 0.87843394, + "learning_rate": 3.967568424329949e-06, + "loss": 0.90149343, + "num_input_tokens_seen": 15007620, + "step": 713, + "time_per_iteration": 2.760525703430176 + }, + { + "auxiliary_loss_clip": 0.01188385, + "auxiliary_loss_mlp": 0.01082953, + "balance_loss_clip": 1.07027304, + "balance_loss_mlp": 1.00169945, + "epoch": 0.08585342391631094, + "flos": 67302739319040.0, + "grad_norm": 0.8306664097617826, + "language_loss": 0.55549437, + "learning_rate": 3.967428561748671e-06, + "loss": 0.57820779, + "num_input_tokens_seen": 15075590, + "step": 714, + "time_per_iteration": 3.435896873474121 + }, + { + "auxiliary_loss_clip": 0.01167486, + "auxiliary_loss_mlp": 0.01097073, + "balance_loss_clip": 1.04910886, + "balance_loss_mlp": 1.01162362, + "epoch": 0.08597366680695004, + "flos": 22456594684800.0, + "grad_norm": 2.118581833295777, + "language_loss": 0.87591237, + "learning_rate": 3.967288400710045e-06, + "loss": 0.8985579, + "num_input_tokens_seen": 15095055, + "step": 715, + "time_per_iteration": 2.8567919731140137 + }, + { + "auxiliary_loss_clip": 0.01172743, + "auxiliary_loss_mlp": 0.01093374, + "balance_loss_clip": 1.05054533, + "balance_loss_mlp": 1.00878346, + "epoch": 0.08609390969758914, + "flos": 23550505430400.0, + "grad_norm": 1.836728560244943, + "language_loss": 0.8860305, + "learning_rate": 3.9671479412353335e-06, + "loss": 0.90869164, + "num_input_tokens_seen": 15113520, + "step": 716, + "time_per_iteration": 2.793097734451294 + }, + { + "auxiliary_loss_clip": 0.01197114, + "auxiliary_loss_mlp": 0.01095301, + "balance_loss_clip": 1.05237436, + "balance_loss_mlp": 1.01018512, + "epoch": 0.08621415258822822, + "flos": 25885848078720.0, + "grad_norm": 2.1094924324399598, + "language_loss": 0.74066961, + "learning_rate": 3.967007183345843e-06, + "loss": 0.76359379, + "num_input_tokens_seen": 15133375, + "step": 717, + "time_per_iteration": 2.7374823093414307 + }, + { + "auxiliary_loss_clip": 0.01189823, + "auxiliary_loss_mlp": 0.01095668, + "balance_loss_clip": 1.05024552, + "balance_loss_mlp": 1.01093435, + "epoch": 0.08633439547886732, + "flos": 13589568120960.0, + "grad_norm": 2.849781232342191, + "language_loss": 0.89266491, + "learning_rate": 3.966866127062927e-06, + "loss": 0.91551983, + "num_input_tokens_seen": 15150500, + "step": 718, + "time_per_iteration": 2.6888434886932373 + }, + { + "auxiliary_loss_clip": 0.01197588, + "auxiliary_loss_mlp": 0.01081514, + "balance_loss_clip": 1.06384635, + "balance_loss_mlp": 1.00026059, + "epoch": 0.0864546383695064, + "flos": 57767342434560.0, + "grad_norm": 0.8644517323593242, + "language_loss": 0.62707096, + "learning_rate": 3.966724772407982e-06, + "loss": 0.64986199, + "num_input_tokens_seen": 15208015, + "step": 719, + "time_per_iteration": 3.135873317718506 + }, + { + "auxiliary_loss_clip": 0.01178668, + "auxiliary_loss_mlp": 0.01096149, + "balance_loss_clip": 1.05077076, + "balance_loss_mlp": 1.01141524, + "epoch": 0.0865748812601455, + "flos": 20046952753920.0, + "grad_norm": 2.1004930969626594, + "language_loss": 0.89256823, + "learning_rate": 3.966583119402454e-06, + "loss": 0.91531634, + "num_input_tokens_seen": 15224780, + "step": 720, + "time_per_iteration": 2.7478156089782715 + }, + { + "auxiliary_loss_clip": 0.0119753, + "auxiliary_loss_mlp": 0.00875702, + "balance_loss_clip": 1.05323601, + "balance_loss_mlp": 1.00031495, + "epoch": 0.08669512415078459, + "flos": 35262446935680.0, + "grad_norm": 2.1125897943164174, + "language_loss": 0.82221985, + "learning_rate": 3.9664411680678305e-06, + "loss": 0.84295213, + "num_input_tokens_seen": 15246535, + "step": 721, + "time_per_iteration": 2.9359707832336426 + }, + { + "auxiliary_loss_clip": 0.01175966, + "auxiliary_loss_mlp": 0.01082037, + "balance_loss_clip": 1.05901885, + "balance_loss_mlp": 1.00078416, + "epoch": 0.08681536704142367, + "flos": 65654870048640.0, + "grad_norm": 0.840398812749694, + "language_loss": 0.61457849, + "learning_rate": 3.966298918425644e-06, + "loss": 0.63715851, + "num_input_tokens_seen": 15304025, + "step": 722, + "time_per_iteration": 3.2184417247772217 + }, + { + "auxiliary_loss_clip": 0.012007, + "auxiliary_loss_mlp": 0.01095845, + "balance_loss_clip": 1.05432701, + "balance_loss_mlp": 1.01115894, + "epoch": 0.08693560993206277, + "flos": 34529940881280.0, + "grad_norm": 2.4387324186269113, + "language_loss": 0.82794428, + "learning_rate": 3.966156370497476e-06, + "loss": 0.85090971, + "num_input_tokens_seen": 15327635, + "step": 723, + "time_per_iteration": 2.8551886081695557 + }, + { + "auxiliary_loss_clip": 0.01200567, + "auxiliary_loss_mlp": 0.01095277, + "balance_loss_clip": 1.0543716, + "balance_loss_mlp": 1.01059115, + "epoch": 0.08705585282270185, + "flos": 23149419189120.0, + "grad_norm": 2.9941126706890424, + "language_loss": 0.88975406, + "learning_rate": 3.96601352430495e-06, + "loss": 0.91271257, + "num_input_tokens_seen": 15347405, + "step": 724, + "time_per_iteration": 2.80926775932312 + }, + { + "auxiliary_loss_clip": 0.01186579, + "auxiliary_loss_mlp": 0.01095478, + "balance_loss_clip": 1.05140495, + "balance_loss_mlp": 1.01102996, + "epoch": 0.08717609571334095, + "flos": 29497599498240.0, + "grad_norm": 1.7944708856007896, + "language_loss": 0.82996345, + "learning_rate": 3.965870379869735e-06, + "loss": 0.85278404, + "num_input_tokens_seen": 15369450, + "step": 725, + "time_per_iteration": 2.896383285522461 + }, + { + "auxiliary_loss_clip": 0.01199489, + "auxiliary_loss_mlp": 0.01097731, + "balance_loss_clip": 1.0529089, + "balance_loss_mlp": 1.01280642, + "epoch": 0.08729633860398003, + "flos": 20667489137280.0, + "grad_norm": 2.2501957234215832, + "language_loss": 0.86773139, + "learning_rate": 3.965726937213547e-06, + "loss": 0.89070356, + "num_input_tokens_seen": 15388085, + "step": 726, + "time_per_iteration": 2.7711355686187744 + }, + { + "auxiliary_loss_clip": 0.011987, + "auxiliary_loss_mlp": 0.01097019, + "balance_loss_clip": 1.0510931, + "balance_loss_mlp": 1.0118556, + "epoch": 0.08741658149461913, + "flos": 18369493655040.0, + "grad_norm": 4.677300979574533, + "language_loss": 0.80848294, + "learning_rate": 3.965583196358144e-06, + "loss": 0.83144009, + "num_input_tokens_seen": 15407120, + "step": 727, + "time_per_iteration": 2.753419876098633 + }, + { + "auxiliary_loss_clip": 0.01208122, + "auxiliary_loss_mlp": 0.0109587, + "balance_loss_clip": 1.05348861, + "balance_loss_mlp": 1.01108873, + "epoch": 0.08753682438525823, + "flos": 18729677283840.0, + "grad_norm": 2.0745016823177407, + "language_loss": 0.74467826, + "learning_rate": 3.965439157325335e-06, + "loss": 0.7677182, + "num_input_tokens_seen": 15424485, + "step": 728, + "time_per_iteration": 4.4720141887664795 + }, + { + "auxiliary_loss_clip": 0.01189224, + "auxiliary_loss_mlp": 0.01093579, + "balance_loss_clip": 1.05120325, + "balance_loss_mlp": 1.00879765, + "epoch": 0.08765706727589731, + "flos": 27776113303680.0, + "grad_norm": 1.9614974137158956, + "language_loss": 0.761455, + "learning_rate": 3.965294820136968e-06, + "loss": 0.78428304, + "num_input_tokens_seen": 15446285, + "step": 729, + "time_per_iteration": 3.869706869125366 + }, + { + "auxiliary_loss_clip": 0.01186838, + "auxiliary_loss_mlp": 0.01095177, + "balance_loss_clip": 1.05219591, + "balance_loss_mlp": 1.01015759, + "epoch": 0.08777731016653641, + "flos": 24389127239040.0, + "grad_norm": 2.0744197542627147, + "language_loss": 0.87174308, + "learning_rate": 3.965150184814938e-06, + "loss": 0.89456326, + "num_input_tokens_seen": 15465770, + "step": 730, + "time_per_iteration": 2.8322253227233887 + }, + { + "auxiliary_loss_clip": 0.01193864, + "auxiliary_loss_mlp": 0.01097487, + "balance_loss_clip": 1.05508876, + "balance_loss_mlp": 1.01261044, + "epoch": 0.08789755305717549, + "flos": 21981855605760.0, + "grad_norm": 2.3851772068778634, + "language_loss": 0.76546854, + "learning_rate": 3.965005251381189e-06, + "loss": 0.78838205, + "num_input_tokens_seen": 15483705, + "step": 731, + "time_per_iteration": 2.8200089931488037 + }, + { + "auxiliary_loss_clip": 0.01188472, + "auxiliary_loss_mlp": 0.01080653, + "balance_loss_clip": 1.05545425, + "balance_loss_mlp": 0.99978143, + "epoch": 0.08801779594781459, + "flos": 58360120583040.0, + "grad_norm": 0.9262700800911413, + "language_loss": 0.64626241, + "learning_rate": 3.964860019857705e-06, + "loss": 0.66895366, + "num_input_tokens_seen": 15548620, + "step": 732, + "time_per_iteration": 4.195911884307861 + }, + { + "auxiliary_loss_clip": 0.01211218, + "auxiliary_loss_mlp": 0.01096992, + "balance_loss_clip": 1.05644703, + "balance_loss_mlp": 1.01278245, + "epoch": 0.08813803883845367, + "flos": 23294785530240.0, + "grad_norm": 1.8827261197468903, + "language_loss": 0.83781731, + "learning_rate": 3.964714490266518e-06, + "loss": 0.86089945, + "num_input_tokens_seen": 15569265, + "step": 733, + "time_per_iteration": 2.750507116317749 + }, + { + "auxiliary_loss_clip": 0.01189978, + "auxiliary_loss_mlp": 0.01081135, + "balance_loss_clip": 1.05773735, + "balance_loss_mlp": 0.99988174, + "epoch": 0.08825828172909277, + "flos": 63424924882560.0, + "grad_norm": 0.9126423670431013, + "language_loss": 0.64612162, + "learning_rate": 3.964568662629706e-06, + "loss": 0.66883272, + "num_input_tokens_seen": 15630570, + "step": 734, + "time_per_iteration": 3.156872034072876 + }, + { + "auxiliary_loss_clip": 0.01191291, + "auxiliary_loss_mlp": 0.01096908, + "balance_loss_clip": 1.05364776, + "balance_loss_mlp": 1.01212621, + "epoch": 0.08837852461973186, + "flos": 26720986268160.0, + "grad_norm": 2.9017169176845936, + "language_loss": 0.84232211, + "learning_rate": 3.9644225369693895e-06, + "loss": 0.86520416, + "num_input_tokens_seen": 15650870, + "step": 735, + "time_per_iteration": 2.794024705886841 + }, + { + "auxiliary_loss_clip": 0.01212813, + "auxiliary_loss_mlp": 0.01096476, + "balance_loss_clip": 1.05881715, + "balance_loss_mlp": 1.01178944, + "epoch": 0.08849876751037095, + "flos": 27265427688960.0, + "grad_norm": 2.3610933624128063, + "language_loss": 0.8674469, + "learning_rate": 3.964276113307735e-06, + "loss": 0.89053977, + "num_input_tokens_seen": 15670835, + "step": 736, + "time_per_iteration": 2.7472164630889893 + }, + { + "auxiliary_loss_clip": 0.0117148, + "auxiliary_loss_mlp": 0.01094942, + "balance_loss_clip": 1.04735172, + "balance_loss_mlp": 1.01011288, + "epoch": 0.08861901040101004, + "flos": 19828759587840.0, + "grad_norm": 2.7534444371650317, + "language_loss": 0.8064273, + "learning_rate": 3.9641293916669574e-06, + "loss": 0.82909143, + "num_input_tokens_seen": 15689795, + "step": 737, + "time_per_iteration": 2.7297024726867676 + }, + { + "auxiliary_loss_clip": 0.01175615, + "auxiliary_loss_mlp": 0.01098261, + "balance_loss_clip": 1.04998493, + "balance_loss_mlp": 1.01333594, + "epoch": 0.08873925329164913, + "flos": 23658704173440.0, + "grad_norm": 2.0550179348203614, + "language_loss": 0.82900465, + "learning_rate": 3.9639823720693115e-06, + "loss": 0.85174334, + "num_input_tokens_seen": 15711650, + "step": 738, + "time_per_iteration": 2.9266316890716553 + }, + { + "auxiliary_loss_clip": 0.01153495, + "auxiliary_loss_mlp": 0.01082028, + "balance_loss_clip": 1.03893185, + "balance_loss_mlp": 1.00077462, + "epoch": 0.08885949618228822, + "flos": 71831541893760.0, + "grad_norm": 0.8354998284657804, + "language_loss": 0.60021091, + "learning_rate": 3.963835054537102e-06, + "loss": 0.6225661, + "num_input_tokens_seen": 15780615, + "step": 739, + "time_per_iteration": 3.377394914627075 + }, + { + "auxiliary_loss_clip": 0.01190892, + "auxiliary_loss_mlp": 0.01093929, + "balance_loss_clip": 1.05278194, + "balance_loss_mlp": 1.00962424, + "epoch": 0.08897973907292732, + "flos": 22346169298560.0, + "grad_norm": 2.1620293707444045, + "language_loss": 0.60373533, + "learning_rate": 3.963687439092676e-06, + "loss": 0.62658358, + "num_input_tokens_seen": 15801300, + "step": 740, + "time_per_iteration": 2.784799575805664 + }, + { + "auxiliary_loss_clip": 0.01193635, + "auxiliary_loss_mlp": 0.01095663, + "balance_loss_clip": 1.05199349, + "balance_loss_mlp": 1.01083326, + "epoch": 0.0890999819635664, + "flos": 21251827589760.0, + "grad_norm": 2.383041302204786, + "language_loss": 0.80456269, + "learning_rate": 3.963539525758427e-06, + "loss": 0.82745564, + "num_input_tokens_seen": 15820860, + "step": 741, + "time_per_iteration": 2.790839910507202 + }, + { + "auxiliary_loss_clip": 0.01180639, + "auxiliary_loss_mlp": 0.0109608, + "balance_loss_clip": 1.04593897, + "balance_loss_mlp": 1.01110792, + "epoch": 0.0892202248542055, + "flos": 25370888745600.0, + "grad_norm": 3.620992332673953, + "language_loss": 0.6781677, + "learning_rate": 3.9633913145567925e-06, + "loss": 0.70093495, + "num_input_tokens_seen": 15841350, + "step": 742, + "time_per_iteration": 2.8299200534820557 + }, + { + "auxiliary_loss_clip": 0.01190982, + "auxiliary_loss_mlp": 0.01098646, + "balance_loss_clip": 1.05360711, + "balance_loss_mlp": 1.01415038, + "epoch": 0.08934046774484458, + "flos": 24457895827200.0, + "grad_norm": 5.434862653277608, + "language_loss": 0.81708765, + "learning_rate": 3.9632428055102575e-06, + "loss": 0.83998394, + "num_input_tokens_seen": 15861360, + "step": 743, + "time_per_iteration": 2.8131496906280518 + }, + { + "auxiliary_loss_clip": 0.011989, + "auxiliary_loss_mlp": 0.01096899, + "balance_loss_clip": 1.05349314, + "balance_loss_mlp": 1.0120697, + "epoch": 0.08946071063548368, + "flos": 35772773414400.0, + "grad_norm": 2.286801809084199, + "language_loss": 0.67077649, + "learning_rate": 3.9630939986413495e-06, + "loss": 0.69373453, + "num_input_tokens_seen": 15883160, + "step": 744, + "time_per_iteration": 2.8523590564727783 + }, + { + "auxiliary_loss_clip": 0.0118034, + "auxiliary_loss_mlp": 0.01094971, + "balance_loss_clip": 1.05228758, + "balance_loss_mlp": 1.01080954, + "epoch": 0.08958095352612276, + "flos": 14356584167040.0, + "grad_norm": 1.9793244500903133, + "language_loss": 0.78590065, + "learning_rate": 3.962944893972643e-06, + "loss": 0.80865377, + "num_input_tokens_seen": 15901610, + "step": 745, + "time_per_iteration": 2.852102279663086 + }, + { + "auxiliary_loss_clip": 0.01186694, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_clip": 1.05236685, + "balance_loss_mlp": 1.01244271, + "epoch": 0.08970119641676186, + "flos": 17853277345920.0, + "grad_norm": 3.2720810712732664, + "language_loss": 0.90689898, + "learning_rate": 3.962795491526756e-06, + "loss": 0.92973912, + "num_input_tokens_seen": 15918770, + "step": 746, + "time_per_iteration": 2.727681875228882 + }, + { + "auxiliary_loss_clip": 0.01208852, + "auxiliary_loss_mlp": 0.01096582, + "balance_loss_clip": 1.05419564, + "balance_loss_mlp": 1.01160991, + "epoch": 0.08982143930740095, + "flos": 20811670329600.0, + "grad_norm": 2.6630970455442124, + "language_loss": 0.88993335, + "learning_rate": 3.962645791326354e-06, + "loss": 0.91298771, + "num_input_tokens_seen": 15938025, + "step": 747, + "time_per_iteration": 2.754143476486206 + }, + { + "auxiliary_loss_clip": 0.01197062, + "auxiliary_loss_mlp": 0.01098457, + "balance_loss_clip": 1.05320108, + "balance_loss_mlp": 1.01391339, + "epoch": 0.08994168219804004, + "flos": 24097712198400.0, + "grad_norm": 1.91697363921839, + "language_loss": 0.82930052, + "learning_rate": 3.962495793394146e-06, + "loss": 0.85225576, + "num_input_tokens_seen": 15957215, + "step": 748, + "time_per_iteration": 2.7794501781463623 + }, + { + "auxiliary_loss_clip": 0.01194135, + "auxiliary_loss_mlp": 0.01081817, + "balance_loss_clip": 1.05269504, + "balance_loss_mlp": 1.00056362, + "epoch": 0.09006192508867913, + "flos": 57188893812480.0, + "grad_norm": 0.7706845330072195, + "language_loss": 0.61230302, + "learning_rate": 3.9623454977528864e-06, + "loss": 0.63506258, + "num_input_tokens_seen": 16015870, + "step": 749, + "time_per_iteration": 3.084900140762329 + }, + { + "auxiliary_loss_clip": 0.01184868, + "auxiliary_loss_mlp": 0.01096445, + "balance_loss_clip": 1.0552783, + "balance_loss_mlp": 1.01156795, + "epoch": 0.09018216797931822, + "flos": 20487505063680.0, + "grad_norm": 2.0076417169160927, + "language_loss": 0.84985989, + "learning_rate": 3.962194904425375e-06, + "loss": 0.87267303, + "num_input_tokens_seen": 16036500, + "step": 750, + "time_per_iteration": 2.899341106414795 + }, + { + "auxiliary_loss_clip": 0.01188642, + "auxiliary_loss_mlp": 0.01096839, + "balance_loss_clip": 1.04937196, + "balance_loss_mlp": 1.01191425, + "epoch": 0.09030241086995731, + "flos": 22638123043200.0, + "grad_norm": 2.083613234544036, + "language_loss": 0.68115205, + "learning_rate": 3.9620440134344566e-06, + "loss": 0.70400685, + "num_input_tokens_seen": 16054655, + "step": 751, + "time_per_iteration": 2.723421812057495 + }, + { + "auxiliary_loss_clip": 0.01168452, + "auxiliary_loss_mlp": 0.01095152, + "balance_loss_clip": 1.04878831, + "balance_loss_mlp": 1.01022756, + "epoch": 0.09042265376059641, + "flos": 21871502046720.0, + "grad_norm": 2.2814902746106016, + "language_loss": 0.82295322, + "learning_rate": 3.9618928248030215e-06, + "loss": 0.84558928, + "num_input_tokens_seen": 16074165, + "step": 752, + "time_per_iteration": 2.9086053371429443 + }, + { + "auxiliary_loss_clip": 0.01194974, + "auxiliary_loss_mlp": 0.01095769, + "balance_loss_clip": 1.05211377, + "balance_loss_mlp": 1.01136947, + "epoch": 0.0905428966512355, + "flos": 24316192673280.0, + "grad_norm": 2.340088000730613, + "language_loss": 0.82841569, + "learning_rate": 3.961741338554005e-06, + "loss": 0.85132313, + "num_input_tokens_seen": 16092505, + "step": 753, + "time_per_iteration": 2.733262300491333 + }, + { + "auxiliary_loss_clip": 0.01190044, + "auxiliary_loss_mlp": 0.01095457, + "balance_loss_clip": 1.05384994, + "balance_loss_mlp": 1.01057982, + "epoch": 0.09066313954187459, + "flos": 35845061535360.0, + "grad_norm": 2.612996214640189, + "language_loss": 0.75523394, + "learning_rate": 3.9615895547103865e-06, + "loss": 0.77808893, + "num_input_tokens_seen": 16116150, + "step": 754, + "time_per_iteration": 4.855449438095093 + }, + { + "auxiliary_loss_clip": 0.01187821, + "auxiliary_loss_mlp": 0.01095056, + "balance_loss_clip": 1.05237627, + "balance_loss_mlp": 1.01017904, + "epoch": 0.09078338243251367, + "flos": 29168729550720.0, + "grad_norm": 1.9753623512743288, + "language_loss": 0.77837706, + "learning_rate": 3.961437473295193e-06, + "loss": 0.80120587, + "num_input_tokens_seen": 16136295, + "step": 755, + "time_per_iteration": 2.8671867847442627 + }, + { + "auxiliary_loss_clip": 0.01171056, + "auxiliary_loss_mlp": 0.0109639, + "balance_loss_clip": 1.04988956, + "balance_loss_mlp": 1.01184678, + "epoch": 0.09090362532315277, + "flos": 21907699977600.0, + "grad_norm": 2.503206275820662, + "language_loss": 0.72401142, + "learning_rate": 3.961285094331495e-06, + "loss": 0.74668592, + "num_input_tokens_seen": 16154210, + "step": 756, + "time_per_iteration": 2.8349177837371826 + }, + { + "auxiliary_loss_clip": 0.01208039, + "auxiliary_loss_mlp": 0.0109648, + "balance_loss_clip": 1.05367303, + "balance_loss_mlp": 1.01188946, + "epoch": 0.09102386821379185, + "flos": 27344503480320.0, + "grad_norm": 1.7890254898065696, + "language_loss": 0.85752159, + "learning_rate": 3.961132417842406e-06, + "loss": 0.88056678, + "num_input_tokens_seen": 16173995, + "step": 757, + "time_per_iteration": 3.6312882900238037 + }, + { + "auxiliary_loss_clip": 0.01199989, + "auxiliary_loss_mlp": 0.01095754, + "balance_loss_clip": 1.05394602, + "balance_loss_mlp": 1.01106787, + "epoch": 0.09114411110443095, + "flos": 20813501923200.0, + "grad_norm": 3.1533124095959315, + "language_loss": 0.752406, + "learning_rate": 3.960979443851089e-06, + "loss": 0.77536339, + "num_input_tokens_seen": 16191020, + "step": 758, + "time_per_iteration": 2.7829504013061523 + }, + { + "auxiliary_loss_clip": 0.01194891, + "auxiliary_loss_mlp": 0.01093131, + "balance_loss_clip": 1.0575583, + "balance_loss_mlp": 1.00830221, + "epoch": 0.09126435399507005, + "flos": 26145949438080.0, + "grad_norm": 1.855552590719341, + "language_loss": 0.78611124, + "learning_rate": 3.96082617238075e-06, + "loss": 0.80899149, + "num_input_tokens_seen": 16213645, + "step": 759, + "time_per_iteration": 2.7984230518341064 + }, + { + "auxiliary_loss_clip": 0.01188152, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_clip": 1.05243254, + "balance_loss_mlp": 1.00976706, + "epoch": 0.09138459688570913, + "flos": 24388911757440.0, + "grad_norm": 2.9713563326291985, + "language_loss": 0.79903305, + "learning_rate": 3.960672603454639e-06, + "loss": 0.82185626, + "num_input_tokens_seen": 16233625, + "step": 760, + "time_per_iteration": 2.773555278778076 + }, + { + "auxiliary_loss_clip": 0.01200165, + "auxiliary_loss_mlp": 0.01095771, + "balance_loss_clip": 1.05434382, + "balance_loss_mlp": 1.01094127, + "epoch": 0.09150483977634823, + "flos": 21032664756480.0, + "grad_norm": 7.567673981810812, + "language_loss": 0.77142674, + "learning_rate": 3.960518737096054e-06, + "loss": 0.79438615, + "num_input_tokens_seen": 16253255, + "step": 761, + "time_per_iteration": 2.6701714992523193 + }, + { + "auxiliary_loss_clip": 0.01198572, + "auxiliary_loss_mlp": 0.01098025, + "balance_loss_clip": 1.05382764, + "balance_loss_mlp": 1.01357746, + "epoch": 0.09162508266698731, + "flos": 22856998567680.0, + "grad_norm": 2.6008007204064607, + "language_loss": 0.72891116, + "learning_rate": 3.960364573328334e-06, + "loss": 0.75187707, + "num_input_tokens_seen": 16272580, + "step": 762, + "time_per_iteration": 2.6820998191833496 + }, + { + "auxiliary_loss_clip": 0.01174968, + "auxiliary_loss_mlp": 0.01096886, + "balance_loss_clip": 1.0514257, + "balance_loss_mlp": 1.01181793, + "epoch": 0.0917453255576264, + "flos": 21724411852800.0, + "grad_norm": 7.11070157544705, + "language_loss": 0.88851058, + "learning_rate": 3.9602101121748675e-06, + "loss": 0.91122913, + "num_input_tokens_seen": 16293075, + "step": 763, + "time_per_iteration": 2.765181064605713 + }, + { + "auxiliary_loss_clip": 0.01186861, + "auxiliary_loss_mlp": 0.01094463, + "balance_loss_clip": 1.05180275, + "balance_loss_mlp": 1.00987172, + "epoch": 0.0918655684482655, + "flos": 14609215497600.0, + "grad_norm": 1.9988134743251096, + "language_loss": 0.72347617, + "learning_rate": 3.960055353659085e-06, + "loss": 0.74628937, + "num_input_tokens_seen": 16310185, + "step": 764, + "time_per_iteration": 2.710538387298584 + }, + { + "auxiliary_loss_clip": 0.01179128, + "auxiliary_loss_mlp": 0.0109377, + "balance_loss_clip": 1.05303359, + "balance_loss_mlp": 1.00956094, + "epoch": 0.09198581133890459, + "flos": 23435016226560.0, + "grad_norm": 1.7748636948419643, + "language_loss": 0.83440781, + "learning_rate": 3.959900297804465e-06, + "loss": 0.85713685, + "num_input_tokens_seen": 16330355, + "step": 765, + "time_per_iteration": 2.8064231872558594 + }, + { + "auxiliary_loss_clip": 0.01182014, + "auxiliary_loss_mlp": 0.01096107, + "balance_loss_clip": 1.0483129, + "balance_loss_mlp": 1.01137328, + "epoch": 0.09210605422954368, + "flos": 16795887753600.0, + "grad_norm": 1.9312289476096618, + "language_loss": 0.77298158, + "learning_rate": 3.9597449446345276e-06, + "loss": 0.79576278, + "num_input_tokens_seen": 16347600, + "step": 766, + "time_per_iteration": 2.717428207397461 + }, + { + "auxiliary_loss_clip": 0.01193533, + "auxiliary_loss_mlp": 0.0109651, + "balance_loss_clip": 1.05567658, + "balance_loss_mlp": 1.0121572, + "epoch": 0.09222629712018277, + "flos": 22674249146880.0, + "grad_norm": 2.576739229271798, + "language_loss": 0.83262563, + "learning_rate": 3.95958929417284e-06, + "loss": 0.85552609, + "num_input_tokens_seen": 16365755, + "step": 767, + "time_per_iteration": 2.788774013519287 + }, + { + "auxiliary_loss_clip": 0.01196255, + "auxiliary_loss_mlp": 0.0108525, + "balance_loss_clip": 1.06335402, + "balance_loss_mlp": 1.00437832, + "epoch": 0.09234654001082186, + "flos": 69976756327680.0, + "grad_norm": 0.7296033323999647, + "language_loss": 0.58820522, + "learning_rate": 3.9594333464430145e-06, + "loss": 0.61102027, + "num_input_tokens_seen": 16435245, + "step": 768, + "time_per_iteration": 3.3913326263427734 + }, + { + "auxiliary_loss_clip": 0.01137425, + "auxiliary_loss_mlp": 0.01097011, + "balance_loss_clip": 1.04710925, + "balance_loss_mlp": 1.0124197, + "epoch": 0.09246678290146094, + "flos": 20011437181440.0, + "grad_norm": 2.1622810224351823, + "language_loss": 0.88076895, + "learning_rate": 3.959277101468709e-06, + "loss": 0.90311331, + "num_input_tokens_seen": 16454795, + "step": 769, + "time_per_iteration": 2.9609603881835938 + }, + { + "auxiliary_loss_clip": 0.01177481, + "auxiliary_loss_mlp": 0.01096499, + "balance_loss_clip": 1.05784392, + "balance_loss_mlp": 1.01209855, + "epoch": 0.09258702579210004, + "flos": 17747448900480.0, + "grad_norm": 2.6401770968189306, + "language_loss": 0.78653222, + "learning_rate": 3.959120559273624e-06, + "loss": 0.80927205, + "num_input_tokens_seen": 16472580, + "step": 770, + "time_per_iteration": 2.9332098960876465 + }, + { + "auxiliary_loss_clip": 0.01181539, + "auxiliary_loss_mlp": 0.01097248, + "balance_loss_clip": 1.05474806, + "balance_loss_mlp": 1.012609, + "epoch": 0.09270726868273914, + "flos": 20886544229760.0, + "grad_norm": 1.9284160922141544, + "language_loss": 0.83530742, + "learning_rate": 3.958963719881509e-06, + "loss": 0.85809529, + "num_input_tokens_seen": 16490670, + "step": 771, + "time_per_iteration": 2.771437406539917 + }, + { + "auxiliary_loss_clip": 0.01196995, + "auxiliary_loss_mlp": 0.01095524, + "balance_loss_clip": 1.05422139, + "balance_loss_mlp": 1.01098073, + "epoch": 0.09282751157337822, + "flos": 17015697031680.0, + "grad_norm": 3.2805773436434755, + "language_loss": 0.93587518, + "learning_rate": 3.958806583316154e-06, + "loss": 0.95880038, + "num_input_tokens_seen": 16508640, + "step": 772, + "time_per_iteration": 2.7114925384521484 + }, + { + "auxiliary_loss_clip": 0.01212146, + "auxiliary_loss_mlp": 0.0109409, + "balance_loss_clip": 1.05854082, + "balance_loss_mlp": 1.0098331, + "epoch": 0.09294775446401732, + "flos": 32523647748480.0, + "grad_norm": 1.7557406665618416, + "language_loss": 0.78730649, + "learning_rate": 3.9586491496013985e-06, + "loss": 0.81036884, + "num_input_tokens_seen": 16531035, + "step": 773, + "time_per_iteration": 2.731624126434326 + }, + { + "auxiliary_loss_clip": 0.01205459, + "auxiliary_loss_mlp": 0.01095346, + "balance_loss_clip": 1.06001067, + "balance_loss_mlp": 1.01075554, + "epoch": 0.0930679973546564, + "flos": 18259750627200.0, + "grad_norm": 2.3040774709953453, + "language_loss": 0.82758808, + "learning_rate": 3.958491418761124e-06, + "loss": 0.85059619, + "num_input_tokens_seen": 16548605, + "step": 774, + "time_per_iteration": 2.755950450897217 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.01096463, + "balance_loss_clip": 1.05208015, + "balance_loss_mlp": 1.01158631, + "epoch": 0.0931882402452955, + "flos": 21099745405440.0, + "grad_norm": 2.914410115398736, + "language_loss": 0.72753465, + "learning_rate": 3.958333390819258e-06, + "loss": 0.75039142, + "num_input_tokens_seen": 16565535, + "step": 775, + "time_per_iteration": 2.71010684967041 + }, + { + "auxiliary_loss_clip": 0.0120936, + "auxiliary_loss_mlp": 0.01092915, + "balance_loss_clip": 1.05588329, + "balance_loss_mlp": 1.00889659, + "epoch": 0.0933084831359346, + "flos": 24207275658240.0, + "grad_norm": 2.211778958167583, + "language_loss": 0.80157632, + "learning_rate": 3.9581750657997754e-06, + "loss": 0.82459903, + "num_input_tokens_seen": 16584900, + "step": 776, + "time_per_iteration": 2.704251766204834 + }, + { + "auxiliary_loss_clip": 0.01185092, + "auxiliary_loss_mlp": 0.0109259, + "balance_loss_clip": 1.05072856, + "balance_loss_mlp": 1.00852358, + "epoch": 0.09342872602657368, + "flos": 25480272637440.0, + "grad_norm": 1.7678004761311048, + "language_loss": 0.89575714, + "learning_rate": 3.95801644372669e-06, + "loss": 0.91853398, + "num_input_tokens_seen": 16604805, + "step": 777, + "time_per_iteration": 2.8079867362976074 + }, + { + "auxiliary_loss_clip": 0.01190032, + "auxiliary_loss_mlp": 0.01093881, + "balance_loss_clip": 1.05272675, + "balance_loss_mlp": 1.00952911, + "epoch": 0.09354896891721277, + "flos": 23149060053120.0, + "grad_norm": 2.0074940309061446, + "language_loss": 0.84283614, + "learning_rate": 3.957857524624068e-06, + "loss": 0.86567533, + "num_input_tokens_seen": 16623685, + "step": 778, + "time_per_iteration": 2.7790253162384033 + }, + { + "auxiliary_loss_clip": 0.01188841, + "auxiliary_loss_mlp": 0.01099546, + "balance_loss_clip": 1.05418277, + "balance_loss_mlp": 1.01476479, + "epoch": 0.09366921180785186, + "flos": 24279563779200.0, + "grad_norm": 1.7537474444762178, + "language_loss": 0.89563382, + "learning_rate": 3.957698308516016e-06, + "loss": 0.91851771, + "num_input_tokens_seen": 16644985, + "step": 779, + "time_per_iteration": 2.7688307762145996 + }, + { + "auxiliary_loss_clip": 0.01195113, + "auxiliary_loss_mlp": 0.00875419, + "balance_loss_clip": 1.0545969, + "balance_loss_mlp": 1.00017214, + "epoch": 0.09378945469849095, + "flos": 18730036419840.0, + "grad_norm": 3.748397987611424, + "language_loss": 0.82474411, + "learning_rate": 3.957538795426688e-06, + "loss": 0.84544945, + "num_input_tokens_seen": 16662410, + "step": 780, + "time_per_iteration": 4.644619941711426 + }, + { + "auxiliary_loss_clip": 0.01187326, + "auxiliary_loss_mlp": 0.01094909, + "balance_loss_clip": 1.05218816, + "balance_loss_mlp": 1.0106523, + "epoch": 0.09390969758913004, + "flos": 23218834222080.0, + "grad_norm": 4.510107590523317, + "language_loss": 0.77503759, + "learning_rate": 3.9573789853802804e-06, + "loss": 0.79785991, + "num_input_tokens_seen": 16680885, + "step": 781, + "time_per_iteration": 2.7795660495758057 + }, + { + "auxiliary_loss_clip": 0.01187674, + "auxiliary_loss_mlp": 0.0087541, + "balance_loss_clip": 1.05406547, + "balance_loss_mlp": 1.00019217, + "epoch": 0.09402994047976913, + "flos": 19646728439040.0, + "grad_norm": 2.0298082562089945, + "language_loss": 0.74783927, + "learning_rate": 3.957218878401037e-06, + "loss": 0.76847005, + "num_input_tokens_seen": 16699375, + "step": 782, + "time_per_iteration": 3.619910717010498 + }, + { + "auxiliary_loss_clip": 0.01211424, + "auxiliary_loss_mlp": 0.01095673, + "balance_loss_clip": 1.0577811, + "balance_loss_mlp": 1.01093888, + "epoch": 0.09415018337040823, + "flos": 29420463041280.0, + "grad_norm": 1.8048598856409959, + "language_loss": 0.89529818, + "learning_rate": 3.957058474513246e-06, + "loss": 0.91836917, + "num_input_tokens_seen": 16719230, + "step": 783, + "time_per_iteration": 2.706242561340332 + }, + { + "auxiliary_loss_clip": 0.01197415, + "auxiliary_loss_mlp": 0.01092842, + "balance_loss_clip": 1.05448794, + "balance_loss_mlp": 1.00867987, + "epoch": 0.09427042626104731, + "flos": 24572092141440.0, + "grad_norm": 1.8020174437239025, + "language_loss": 0.78501678, + "learning_rate": 3.956897773741241e-06, + "loss": 0.80791932, + "num_input_tokens_seen": 16738220, + "step": 784, + "time_per_iteration": 2.8626105785369873 + }, + { + "auxiliary_loss_clip": 0.01181605, + "auxiliary_loss_mlp": 0.01096024, + "balance_loss_clip": 1.04815137, + "balance_loss_mlp": 1.0116235, + "epoch": 0.09439066915168641, + "flos": 26359581576960.0, + "grad_norm": 1.645986466169863, + "language_loss": 0.71698344, + "learning_rate": 3.956736776109398e-06, + "loss": 0.73975968, + "num_input_tokens_seen": 16759395, + "step": 785, + "time_per_iteration": 2.734646797180176 + }, + { + "auxiliary_loss_clip": 0.0120087, + "auxiliary_loss_mlp": 0.00875502, + "balance_loss_clip": 1.05571032, + "balance_loss_mlp": 1.00016427, + "epoch": 0.09451091204232549, + "flos": 19427278296960.0, + "grad_norm": 2.9272201424897344, + "language_loss": 0.83602536, + "learning_rate": 3.956575481642143e-06, + "loss": 0.85678911, + "num_input_tokens_seen": 16778285, + "step": 786, + "time_per_iteration": 2.7011265754699707 + }, + { + "auxiliary_loss_clip": 0.01155228, + "auxiliary_loss_mlp": 0.0109439, + "balance_loss_clip": 1.04974818, + "balance_loss_mlp": 1.00979924, + "epoch": 0.09463115493296459, + "flos": 25368051571200.0, + "grad_norm": 2.3503884766429457, + "language_loss": 0.75322145, + "learning_rate": 3.956413890363943e-06, + "loss": 0.77571762, + "num_input_tokens_seen": 16795265, + "step": 787, + "time_per_iteration": 2.8261168003082275 + }, + { + "auxiliary_loss_clip": 0.01189487, + "auxiliary_loss_mlp": 0.01094014, + "balance_loss_clip": 1.05186069, + "balance_loss_mlp": 1.00970936, + "epoch": 0.09475139782360369, + "flos": 10123254869760.0, + "grad_norm": 2.1682146008102405, + "language_loss": 0.8148551, + "learning_rate": 3.956252002299312e-06, + "loss": 0.83769011, + "num_input_tokens_seen": 16811165, + "step": 788, + "time_per_iteration": 2.67716646194458 + }, + { + "auxiliary_loss_clip": 0.01208487, + "auxiliary_loss_mlp": 0.01095278, + "balance_loss_clip": 1.05559933, + "balance_loss_mlp": 1.01087785, + "epoch": 0.09487164071424277, + "flos": 17231088936960.0, + "grad_norm": 3.6717936280397034, + "language_loss": 0.90832007, + "learning_rate": 3.956089817472807e-06, + "loss": 0.93135774, + "num_input_tokens_seen": 16828470, + "step": 789, + "time_per_iteration": 2.644118070602417 + }, + { + "auxiliary_loss_clip": 0.01181949, + "auxiliary_loss_mlp": 0.0109499, + "balance_loss_clip": 1.05035591, + "balance_loss_mlp": 1.01044655, + "epoch": 0.09499188360488187, + "flos": 30849564528000.0, + "grad_norm": 2.094464386399811, + "language_loss": 0.85646039, + "learning_rate": 3.955927335909032e-06, + "loss": 0.87922978, + "num_input_tokens_seen": 16851680, + "step": 790, + "time_per_iteration": 2.828059673309326 + }, + { + "auxiliary_loss_clip": 0.01158324, + "auxiliary_loss_mlp": 0.01092346, + "balance_loss_clip": 1.04605937, + "balance_loss_mlp": 1.00813699, + "epoch": 0.09511212649552095, + "flos": 29351694453120.0, + "grad_norm": 2.0263707838798, + "language_loss": 0.76096916, + "learning_rate": 3.955764557632634e-06, + "loss": 0.78347588, + "num_input_tokens_seen": 16871490, + "step": 791, + "time_per_iteration": 2.8739426136016846 + }, + { + "auxiliary_loss_clip": 0.01188957, + "auxiliary_loss_mlp": 0.01093218, + "balance_loss_clip": 1.05411553, + "balance_loss_mlp": 1.00872231, + "epoch": 0.09523236938616005, + "flos": 10378687461120.0, + "grad_norm": 3.1736680310756187, + "language_loss": 0.9450841, + "learning_rate": 3.955601482668309e-06, + "loss": 0.96790582, + "num_input_tokens_seen": 16889350, + "step": 792, + "time_per_iteration": 2.7154734134674072 + }, + { + "auxiliary_loss_clip": 0.0117394, + "auxiliary_loss_mlp": 0.01094334, + "balance_loss_clip": 1.0537256, + "balance_loss_mlp": 1.00979078, + "epoch": 0.09535261227679913, + "flos": 19061815368960.0, + "grad_norm": 2.2045750630083423, + "language_loss": 0.88523531, + "learning_rate": 3.955438111040794e-06, + "loss": 0.90791798, + "num_input_tokens_seen": 16907625, + "step": 793, + "time_per_iteration": 2.815078020095825 + }, + { + "auxiliary_loss_clip": 0.01159208, + "auxiliary_loss_mlp": 0.01097365, + "balance_loss_clip": 1.05178297, + "balance_loss_mlp": 1.0126313, + "epoch": 0.09547285516743823, + "flos": 20922993555840.0, + "grad_norm": 1.920315493058942, + "language_loss": 0.79755378, + "learning_rate": 3.955274442774873e-06, + "loss": 0.8201195, + "num_input_tokens_seen": 16926205, + "step": 794, + "time_per_iteration": 2.786180257797241 + }, + { + "auxiliary_loss_clip": 0.01193445, + "auxiliary_loss_mlp": 0.01094416, + "balance_loss_clip": 1.05741799, + "balance_loss_mlp": 1.00958681, + "epoch": 0.09559309805807732, + "flos": 30154405639680.0, + "grad_norm": 2.22921688144805, + "language_loss": 0.70234925, + "learning_rate": 3.9551104778953725e-06, + "loss": 0.72522783, + "num_input_tokens_seen": 16946500, + "step": 795, + "time_per_iteration": 2.8365373611450195 + }, + { + "auxiliary_loss_clip": 0.01177971, + "auxiliary_loss_mlp": 0.01093511, + "balance_loss_clip": 1.05245233, + "balance_loss_mlp": 1.00882483, + "epoch": 0.0957133409487164, + "flos": 21066743784960.0, + "grad_norm": 1.7933814715625855, + "language_loss": 0.85474861, + "learning_rate": 3.954946216427167e-06, + "loss": 0.87746346, + "num_input_tokens_seen": 16966960, + "step": 796, + "time_per_iteration": 2.7908759117126465 + }, + { + "auxiliary_loss_clip": 0.01179369, + "auxiliary_loss_mlp": 0.01082016, + "balance_loss_clip": 1.06441092, + "balance_loss_mlp": 1.00114405, + "epoch": 0.0958335838393555, + "flos": 71297979315840.0, + "grad_norm": 0.8780061061277596, + "language_loss": 0.61544657, + "learning_rate": 3.954781658395176e-06, + "loss": 0.63806039, + "num_input_tokens_seen": 17023215, + "step": 797, + "time_per_iteration": 3.2915563583374023 + }, + { + "auxiliary_loss_clip": 0.01191245, + "auxiliary_loss_mlp": 0.01093827, + "balance_loss_clip": 1.05501342, + "balance_loss_mlp": 1.00904536, + "epoch": 0.09595382672999458, + "flos": 21872974504320.0, + "grad_norm": 3.0465542979685765, + "language_loss": 0.92432392, + "learning_rate": 3.95461680382436e-06, + "loss": 0.94717467, + "num_input_tokens_seen": 17042140, + "step": 798, + "time_per_iteration": 2.7508885860443115 + }, + { + "auxiliary_loss_clip": 0.01199377, + "auxiliary_loss_mlp": 0.01097809, + "balance_loss_clip": 1.0566299, + "balance_loss_mlp": 1.01283693, + "epoch": 0.09607406962063368, + "flos": 18695562341760.0, + "grad_norm": 3.648910142726449, + "language_loss": 0.86597431, + "learning_rate": 3.9544516527397295e-06, + "loss": 0.88894612, + "num_input_tokens_seen": 17058490, + "step": 799, + "time_per_iteration": 2.67826771736145 + }, + { + "auxiliary_loss_clip": 0.01176221, + "auxiliary_loss_mlp": 0.01097823, + "balance_loss_clip": 1.05229247, + "balance_loss_mlp": 1.01299405, + "epoch": 0.09619431251127276, + "flos": 22568456615040.0, + "grad_norm": 1.7613793343895363, + "language_loss": 0.808855, + "learning_rate": 3.954286205166338e-06, + "loss": 0.83159542, + "num_input_tokens_seen": 17079655, + "step": 800, + "time_per_iteration": 2.8132879734039307 + }, + { + "auxiliary_loss_clip": 0.01200899, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_clip": 1.05825448, + "balance_loss_mlp": 1.01344013, + "epoch": 0.09631455540191186, + "flos": 14246230608000.0, + "grad_norm": 2.5468941630562822, + "language_loss": 0.83815175, + "learning_rate": 3.954120461129282e-06, + "loss": 0.86114872, + "num_input_tokens_seen": 17097065, + "step": 801, + "time_per_iteration": 2.665971040725708 + }, + { + "auxiliary_loss_clip": 0.0121245, + "auxiliary_loss_mlp": 0.01099208, + "balance_loss_clip": 1.05948806, + "balance_loss_mlp": 1.0148077, + "epoch": 0.09643479829255096, + "flos": 20740387789440.0, + "grad_norm": 2.148374023570684, + "language_loss": 0.83822644, + "learning_rate": 3.953954420653706e-06, + "loss": 0.86134303, + "num_input_tokens_seen": 17114090, + "step": 802, + "time_per_iteration": 2.6696839332580566 + }, + { + "auxiliary_loss_clip": 0.01200048, + "auxiliary_loss_mlp": 0.01097281, + "balance_loss_clip": 1.05657506, + "balance_loss_mlp": 1.01283324, + "epoch": 0.09655504118319004, + "flos": 24420476833920.0, + "grad_norm": 1.881758992421834, + "language_loss": 0.87977844, + "learning_rate": 3.953788083764798e-06, + "loss": 0.90275168, + "num_input_tokens_seen": 17133325, + "step": 803, + "time_per_iteration": 2.690701961517334 + }, + { + "auxiliary_loss_clip": 0.01163089, + "auxiliary_loss_mlp": 0.01095376, + "balance_loss_clip": 1.04952216, + "balance_loss_mlp": 1.01083291, + "epoch": 0.09667528407382914, + "flos": 18441961344000.0, + "grad_norm": 4.733486580288493, + "language_loss": 0.92347181, + "learning_rate": 3.953621450487792e-06, + "loss": 0.94605649, + "num_input_tokens_seen": 17151945, + "step": 804, + "time_per_iteration": 3.675278663635254 + }, + { + "auxiliary_loss_clip": 0.01215983, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_clip": 1.07598376, + "balance_loss_mlp": 1.00416458, + "epoch": 0.09679552696446822, + "flos": 70816455544320.0, + "grad_norm": 0.8433174191242012, + "language_loss": 0.61183697, + "learning_rate": 3.953454520847964e-06, + "loss": 0.63485098, + "num_input_tokens_seen": 17216790, + "step": 805, + "time_per_iteration": 4.227905750274658 + }, + { + "auxiliary_loss_clip": 0.01190539, + "auxiliary_loss_mlp": 0.01097945, + "balance_loss_clip": 1.05580246, + "balance_loss_mlp": 1.01282978, + "epoch": 0.09691576985510732, + "flos": 21945514020480.0, + "grad_norm": 2.34322191643404, + "language_loss": 0.73510039, + "learning_rate": 3.9532872948706395e-06, + "loss": 0.75798523, + "num_input_tokens_seen": 17236285, + "step": 806, + "time_per_iteration": 3.766140937805176 + }, + { + "auxiliary_loss_clip": 0.01185203, + "auxiliary_loss_mlp": 0.01093633, + "balance_loss_clip": 1.0535841, + "balance_loss_mlp": 1.00842273, + "epoch": 0.09703601274574641, + "flos": 17965211103360.0, + "grad_norm": 2.501800437141934, + "language_loss": 0.82803339, + "learning_rate": 3.9531197725811845e-06, + "loss": 0.85082173, + "num_input_tokens_seen": 17251670, + "step": 807, + "time_per_iteration": 2.6649482250213623 + }, + { + "auxiliary_loss_clip": 0.01213128, + "auxiliary_loss_mlp": 0.01097067, + "balance_loss_clip": 1.06085944, + "balance_loss_mlp": 1.01252413, + "epoch": 0.0971562556363855, + "flos": 22162162901760.0, + "grad_norm": 2.368420272162879, + "language_loss": 0.87826014, + "learning_rate": 3.952951954005013e-06, + "loss": 0.90136212, + "num_input_tokens_seen": 17271355, + "step": 808, + "time_per_iteration": 3.6295199394226074 + }, + { + "auxiliary_loss_clip": 0.01184161, + "auxiliary_loss_mlp": 0.01094168, + "balance_loss_clip": 1.05560589, + "balance_loss_mlp": 1.00967216, + "epoch": 0.0972764985270246, + "flos": 25848716394240.0, + "grad_norm": 2.0281528520786796, + "language_loss": 0.84533966, + "learning_rate": 3.952783839167584e-06, + "loss": 0.86812299, + "num_input_tokens_seen": 17291400, + "step": 809, + "time_per_iteration": 2.818639039993286 + }, + { + "auxiliary_loss_clip": 0.01195481, + "auxiliary_loss_mlp": 0.01098639, + "balance_loss_clip": 1.0540297, + "balance_loss_mlp": 1.0138092, + "epoch": 0.09739674141766368, + "flos": 20339373375360.0, + "grad_norm": 3.315337163386557, + "language_loss": 0.74551678, + "learning_rate": 3.952615428094398e-06, + "loss": 0.76845789, + "num_input_tokens_seen": 17310920, + "step": 810, + "time_per_iteration": 2.677090644836426 + }, + { + "auxiliary_loss_clip": 0.01165263, + "auxiliary_loss_mlp": 0.01097029, + "balance_loss_clip": 1.05313981, + "balance_loss_mlp": 1.01210427, + "epoch": 0.09751698430830277, + "flos": 15743059188480.0, + "grad_norm": 1.8744597988748046, + "language_loss": 0.73574829, + "learning_rate": 3.952446720811004e-06, + "loss": 0.75837123, + "num_input_tokens_seen": 17329245, + "step": 811, + "time_per_iteration": 2.7767038345336914 + }, + { + "auxiliary_loss_clip": 0.01165436, + "auxiliary_loss_mlp": 0.01085454, + "balance_loss_clip": 1.05792177, + "balance_loss_mlp": 1.00420058, + "epoch": 0.09763722719894186, + "flos": 63716806800000.0, + "grad_norm": 0.8354600248866421, + "language_loss": 0.63659048, + "learning_rate": 3.952277717342995e-06, + "loss": 0.6590994, + "num_input_tokens_seen": 17395680, + "step": 812, + "time_per_iteration": 3.4346120357513428 + }, + { + "auxiliary_loss_clip": 0.01176828, + "auxiliary_loss_mlp": 0.01099983, + "balance_loss_clip": 1.04985189, + "balance_loss_mlp": 1.01491511, + "epoch": 0.09775747008958095, + "flos": 22090916275200.0, + "grad_norm": 2.704600891784711, + "language_loss": 0.85283017, + "learning_rate": 3.952108417716009e-06, + "loss": 0.87559819, + "num_input_tokens_seen": 17415135, + "step": 813, + "time_per_iteration": 2.7267932891845703 + }, + { + "auxiliary_loss_clip": 0.01207098, + "auxiliary_loss_mlp": 0.01096679, + "balance_loss_clip": 1.06219196, + "balance_loss_mlp": 1.01165938, + "epoch": 0.09787771298022005, + "flos": 21286050272640.0, + "grad_norm": 2.1629394710292935, + "language_loss": 0.84717941, + "learning_rate": 3.951938821955727e-06, + "loss": 0.8702172, + "num_input_tokens_seen": 17434535, + "step": 814, + "time_per_iteration": 2.7022080421447754 + }, + { + "auxiliary_loss_clip": 0.01188464, + "auxiliary_loss_mlp": 0.01093908, + "balance_loss_clip": 1.05404592, + "balance_loss_mlp": 1.00912678, + "epoch": 0.09799795587085913, + "flos": 22054574689920.0, + "grad_norm": 2.174554649724255, + "language_loss": 0.76627833, + "learning_rate": 3.9517689300878786e-06, + "loss": 0.78910202, + "num_input_tokens_seen": 17454270, + "step": 815, + "time_per_iteration": 2.726736068725586 + }, + { + "auxiliary_loss_clip": 0.01209701, + "auxiliary_loss_mlp": 0.01097549, + "balance_loss_clip": 1.05695844, + "balance_loss_mlp": 1.01267242, + "epoch": 0.09811819876149823, + "flos": 22163743100160.0, + "grad_norm": 1.8390769315111584, + "language_loss": 0.78602254, + "learning_rate": 3.951598742138236e-06, + "loss": 0.80909503, + "num_input_tokens_seen": 17472995, + "step": 816, + "time_per_iteration": 2.716655969619751 + }, + { + "auxiliary_loss_clip": 0.0119192, + "auxiliary_loss_mlp": 0.01097422, + "balance_loss_clip": 1.05539954, + "balance_loss_mlp": 1.01249719, + "epoch": 0.09823844165213731, + "flos": 22231111057920.0, + "grad_norm": 1.9117255563290392, + "language_loss": 0.79940248, + "learning_rate": 3.951428258132615e-06, + "loss": 0.82229596, + "num_input_tokens_seen": 17491115, + "step": 817, + "time_per_iteration": 2.708635091781616 + }, + { + "auxiliary_loss_clip": 0.01180628, + "auxiliary_loss_mlp": 0.01096975, + "balance_loss_clip": 1.04760754, + "balance_loss_mlp": 1.01219308, + "epoch": 0.09835868454277641, + "flos": 22487728798080.0, + "grad_norm": 1.8755079684750986, + "language_loss": 0.8471539, + "learning_rate": 3.951257478096879e-06, + "loss": 0.86992991, + "num_input_tokens_seen": 17509480, + "step": 818, + "time_per_iteration": 2.7594659328460693 + }, + { + "auxiliary_loss_clip": 0.01186512, + "auxiliary_loss_mlp": 0.0087559, + "balance_loss_clip": 1.05259526, + "balance_loss_mlp": 1.00024056, + "epoch": 0.0984789274334155, + "flos": 16362554077440.0, + "grad_norm": 3.031374566202843, + "language_loss": 0.68241775, + "learning_rate": 3.951086402056936e-06, + "loss": 0.70303875, + "num_input_tokens_seen": 17524080, + "step": 819, + "time_per_iteration": 2.739668846130371 + }, + { + "auxiliary_loss_clip": 0.01130176, + "auxiliary_loss_mlp": 0.0087552, + "balance_loss_clip": 1.04162705, + "balance_loss_mlp": 1.00027466, + "epoch": 0.09859917032405459, + "flos": 24243545416320.0, + "grad_norm": 1.8234853443674224, + "language_loss": 0.83808422, + "learning_rate": 3.950915030038735e-06, + "loss": 0.85814118, + "num_input_tokens_seen": 17543875, + "step": 820, + "time_per_iteration": 2.9431092739105225 + }, + { + "auxiliary_loss_clip": 0.01193988, + "auxiliary_loss_mlp": 0.01095498, + "balance_loss_clip": 1.05241847, + "balance_loss_mlp": 1.01066899, + "epoch": 0.09871941321469369, + "flos": 17420195064960.0, + "grad_norm": 2.154050475549367, + "language_loss": 0.83636534, + "learning_rate": 3.9507433620682765e-06, + "loss": 0.85926026, + "num_input_tokens_seen": 17560810, + "step": 821, + "time_per_iteration": 2.7230658531188965 + }, + { + "auxiliary_loss_clip": 0.01168397, + "auxiliary_loss_mlp": 0.01096555, + "balance_loss_clip": 1.05087042, + "balance_loss_mlp": 1.01191664, + "epoch": 0.09883965610533277, + "flos": 28477341590400.0, + "grad_norm": 1.8452247340468708, + "language_loss": 0.882599, + "learning_rate": 3.9505713981716e-06, + "loss": 0.90524852, + "num_input_tokens_seen": 17583640, + "step": 822, + "time_per_iteration": 2.98162579536438 + }, + { + "auxiliary_loss_clip": 0.01185322, + "auxiliary_loss_mlp": 0.01097088, + "balance_loss_clip": 1.05330205, + "balance_loss_mlp": 1.01240134, + "epoch": 0.09895989899597187, + "flos": 23693932437120.0, + "grad_norm": 2.039351526034984, + "language_loss": 0.81176728, + "learning_rate": 3.950399138374795e-06, + "loss": 0.83459139, + "num_input_tokens_seen": 17602720, + "step": 823, + "time_per_iteration": 2.7134010791778564 + }, + { + "auxiliary_loss_clip": 0.01196056, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_clip": 1.05311787, + "balance_loss_mlp": 1.00951767, + "epoch": 0.09908014188661095, + "flos": 24679608526080.0, + "grad_norm": 1.9047603556063268, + "language_loss": 0.74202812, + "learning_rate": 3.95022658270399e-06, + "loss": 0.76493216, + "num_input_tokens_seen": 17623085, + "step": 824, + "time_per_iteration": 2.775298595428467 + }, + { + "auxiliary_loss_clip": 0.01184279, + "auxiliary_loss_mlp": 0.01097204, + "balance_loss_clip": 1.05172729, + "balance_loss_mlp": 1.01227927, + "epoch": 0.09920038477725004, + "flos": 14064307200000.0, + "grad_norm": 2.622932899499701, + "language_loss": 0.77975452, + "learning_rate": 3.9500537311853635e-06, + "loss": 0.80256933, + "num_input_tokens_seen": 17641040, + "step": 825, + "time_per_iteration": 2.6820712089538574 + }, + { + "auxiliary_loss_clip": 0.01191282, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_clip": 1.05484271, + "balance_loss_mlp": 1.01296759, + "epoch": 0.09932062766788914, + "flos": 13407070095360.0, + "grad_norm": 2.507685794859657, + "language_loss": 0.83450186, + "learning_rate": 3.949880583845136e-06, + "loss": 0.8573907, + "num_input_tokens_seen": 17659115, + "step": 826, + "time_per_iteration": 2.6792945861816406 + }, + { + "auxiliary_loss_clip": 0.01193023, + "auxiliary_loss_mlp": 0.01097202, + "balance_loss_clip": 1.05670178, + "balance_loss_mlp": 1.01251531, + "epoch": 0.09944087055852822, + "flos": 19500751566720.0, + "grad_norm": 3.7221121206188266, + "language_loss": 0.81200206, + "learning_rate": 3.949707140709575e-06, + "loss": 0.83490425, + "num_input_tokens_seen": 17678845, + "step": 827, + "time_per_iteration": 2.801515817642212 + }, + { + "auxiliary_loss_clip": 0.01199053, + "auxiliary_loss_mlp": 0.01095369, + "balance_loss_clip": 1.05476713, + "balance_loss_mlp": 1.01058745, + "epoch": 0.09956111344916732, + "flos": 17749100926080.0, + "grad_norm": 2.765095335026804, + "language_loss": 0.83603638, + "learning_rate": 3.949533401804991e-06, + "loss": 0.8589806, + "num_input_tokens_seen": 17695750, + "step": 828, + "time_per_iteration": 2.6805408000946045 + }, + { + "auxiliary_loss_clip": 0.01196463, + "auxiliary_loss_mlp": 0.0087559, + "balance_loss_clip": 1.05403423, + "balance_loss_mlp": 1.00021625, + "epoch": 0.0996813563398064, + "flos": 17967581400960.0, + "grad_norm": 1.9044519823932684, + "language_loss": 0.90474725, + "learning_rate": 3.949359367157739e-06, + "loss": 0.92546773, + "num_input_tokens_seen": 17714445, + "step": 829, + "time_per_iteration": 2.6966912746429443 + }, + { + "auxiliary_loss_clip": 0.01197844, + "auxiliary_loss_mlp": 0.01095827, + "balance_loss_clip": 1.05341768, + "balance_loss_mlp": 1.01080704, + "epoch": 0.0998015992304455, + "flos": 17457039440640.0, + "grad_norm": 7.2565882984713435, + "language_loss": 0.75849652, + "learning_rate": 3.949185036794222e-06, + "loss": 0.78143317, + "num_input_tokens_seen": 17732455, + "step": 830, + "time_per_iteration": 4.507457971572876 + }, + { + "auxiliary_loss_clip": 0.01209395, + "auxiliary_loss_mlp": 0.01096914, + "balance_loss_clip": 1.05697787, + "balance_loss_mlp": 1.01222777, + "epoch": 0.0999218421210846, + "flos": 25888757080320.0, + "grad_norm": 1.745294490861668, + "language_loss": 0.78477263, + "learning_rate": 3.949010410740884e-06, + "loss": 0.8078357, + "num_input_tokens_seen": 17755280, + "step": 831, + "time_per_iteration": 3.6295650005340576 + }, + { + "auxiliary_loss_clip": 0.01182712, + "auxiliary_loss_mlp": 0.00875576, + "balance_loss_clip": 1.05088258, + "balance_loss_mlp": 1.00020194, + "epoch": 0.10004208501172368, + "flos": 21215916967680.0, + "grad_norm": 1.8073300414651114, + "language_loss": 0.86316758, + "learning_rate": 3.948835489024216e-06, + "loss": 0.88375044, + "num_input_tokens_seen": 17775015, + "step": 832, + "time_per_iteration": 2.758394241333008 + }, + { + "auxiliary_loss_clip": 0.01200053, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_clip": 1.05600417, + "balance_loss_mlp": 1.01297498, + "epoch": 0.10016232790236278, + "flos": 17348409734400.0, + "grad_norm": 2.2117359392182108, + "language_loss": 0.90205222, + "learning_rate": 3.948660271670755e-06, + "loss": 0.92503124, + "num_input_tokens_seen": 17792165, + "step": 833, + "time_per_iteration": 3.580321788787842 + }, + { + "auxiliary_loss_clip": 0.01178755, + "auxiliary_loss_mlp": 0.0109566, + "balance_loss_clip": 1.04948652, + "balance_loss_mlp": 1.01073575, + "epoch": 0.10028257079300186, + "flos": 25666541591040.0, + "grad_norm": 2.0808453409815226, + "language_loss": 0.842237, + "learning_rate": 3.948484758707079e-06, + "loss": 0.86498117, + "num_input_tokens_seen": 17811765, + "step": 834, + "time_per_iteration": 2.7545883655548096 + }, + { + "auxiliary_loss_clip": 0.01174556, + "auxiliary_loss_mlp": 0.01097873, + "balance_loss_clip": 1.04876208, + "balance_loss_mlp": 1.01270986, + "epoch": 0.10040281368364096, + "flos": 25156035544320.0, + "grad_norm": 2.046228549978487, + "language_loss": 0.83582187, + "learning_rate": 3.948308950159815e-06, + "loss": 0.85854614, + "num_input_tokens_seen": 17830445, + "step": 835, + "time_per_iteration": 2.8086912631988525 + }, + { + "auxiliary_loss_clip": 0.01180028, + "auxiliary_loss_mlp": 0.01097514, + "balance_loss_clip": 1.05217922, + "balance_loss_mlp": 1.01244617, + "epoch": 0.10052305657428004, + "flos": 17603303621760.0, + "grad_norm": 2.304213594321286, + "language_loss": 0.75969458, + "learning_rate": 3.9481328460556326e-06, + "loss": 0.78246999, + "num_input_tokens_seen": 17847665, + "step": 836, + "time_per_iteration": 2.7725560665130615 + }, + { + "auxiliary_loss_clip": 0.01189921, + "auxiliary_loss_mlp": 0.01093109, + "balance_loss_clip": 1.05451262, + "balance_loss_mlp": 1.00837505, + "epoch": 0.10064329946491914, + "flos": 18660154510080.0, + "grad_norm": 2.7289301034483757, + "language_loss": 0.89445758, + "learning_rate": 3.9479564464212455e-06, + "loss": 0.91728789, + "num_input_tokens_seen": 17866825, + "step": 837, + "time_per_iteration": 2.744149684906006 + }, + { + "auxiliary_loss_clip": 0.01207169, + "auxiliary_loss_mlp": 0.01097727, + "balance_loss_clip": 1.05366039, + "balance_loss_mlp": 1.0130403, + "epoch": 0.10076354235555823, + "flos": 17199056983680.0, + "grad_norm": 2.210507428518584, + "language_loss": 0.76374567, + "learning_rate": 3.947779751283414e-06, + "loss": 0.78679466, + "num_input_tokens_seen": 17883995, + "step": 838, + "time_per_iteration": 2.6239118576049805 + }, + { + "auxiliary_loss_clip": 0.01193334, + "auxiliary_loss_mlp": 0.00875623, + "balance_loss_clip": 1.05304885, + "balance_loss_mlp": 1.0002389, + "epoch": 0.10088378524619732, + "flos": 22962252395520.0, + "grad_norm": 1.750433929225647, + "language_loss": 0.76042783, + "learning_rate": 3.947602760668944e-06, + "loss": 0.78111744, + "num_input_tokens_seen": 17903785, + "step": 839, + "time_per_iteration": 2.7066731452941895 + }, + { + "auxiliary_loss_clip": 0.01197292, + "auxiliary_loss_mlp": 0.01099764, + "balance_loss_clip": 1.05525255, + "balance_loss_mlp": 1.01488662, + "epoch": 0.10100402813683641, + "flos": 37885828746240.0, + "grad_norm": 1.7266657615384635, + "language_loss": 0.71419036, + "learning_rate": 3.947425474604684e-06, + "loss": 0.73716092, + "num_input_tokens_seen": 17927720, + "step": 840, + "time_per_iteration": 2.7959485054016113 + }, + { + "auxiliary_loss_clip": 0.01186085, + "auxiliary_loss_mlp": 0.01094163, + "balance_loss_clip": 1.05245638, + "balance_loss_mlp": 1.00966752, + "epoch": 0.1011242710274755, + "flos": 21543458112000.0, + "grad_norm": 2.322100884091662, + "language_loss": 0.92591721, + "learning_rate": 3.947247893117528e-06, + "loss": 0.94871968, + "num_input_tokens_seen": 17946225, + "step": 841, + "time_per_iteration": 2.7200450897216797 + }, + { + "auxiliary_loss_clip": 0.01199435, + "auxiliary_loss_mlp": 0.01096781, + "balance_loss_clip": 1.05425906, + "balance_loss_mlp": 1.01142728, + "epoch": 0.10124451391811459, + "flos": 13621456419840.0, + "grad_norm": 3.319803043199276, + "language_loss": 0.69609338, + "learning_rate": 3.947070016234413e-06, + "loss": 0.71905553, + "num_input_tokens_seen": 17962015, + "step": 842, + "time_per_iteration": 2.6274006366729736 + }, + { + "auxiliary_loss_clip": 0.01187656, + "auxiliary_loss_mlp": 0.01097919, + "balance_loss_clip": 1.05250657, + "balance_loss_mlp": 1.01280379, + "epoch": 0.10136475680875369, + "flos": 16649228522880.0, + "grad_norm": 2.2850463489959334, + "language_loss": 0.74895412, + "learning_rate": 3.946891843982326e-06, + "loss": 0.77180988, + "num_input_tokens_seen": 17979680, + "step": 843, + "time_per_iteration": 2.7148685455322266 + }, + { + "auxiliary_loss_clip": 0.01196305, + "auxiliary_loss_mlp": 0.01098107, + "balance_loss_clip": 1.05445004, + "balance_loss_mlp": 1.01313448, + "epoch": 0.10148499969939277, + "flos": 19461034103040.0, + "grad_norm": 2.4184842258562327, + "language_loss": 0.74509245, + "learning_rate": 3.9467133763882935e-06, + "loss": 0.76803648, + "num_input_tokens_seen": 17998145, + "step": 844, + "time_per_iteration": 2.672520399093628 + }, + { + "auxiliary_loss_clip": 0.01191787, + "auxiliary_loss_mlp": 0.01101732, + "balance_loss_clip": 1.05645537, + "balance_loss_mlp": 1.01699853, + "epoch": 0.10160524259003187, + "flos": 21104988791040.0, + "grad_norm": 2.0318275385503375, + "language_loss": 0.86144137, + "learning_rate": 3.9465346134793905e-06, + "loss": 0.88437653, + "num_input_tokens_seen": 18017955, + "step": 845, + "time_per_iteration": 2.657498359680176 + }, + { + "auxiliary_loss_clip": 0.01172843, + "auxiliary_loss_mlp": 0.01095352, + "balance_loss_clip": 1.04938316, + "balance_loss_mlp": 1.01071286, + "epoch": 0.10172548548067095, + "flos": 17712687513600.0, + "grad_norm": 3.1419736067125195, + "language_loss": 0.79702282, + "learning_rate": 3.9463555552827335e-06, + "loss": 0.81970477, + "num_input_tokens_seen": 18035125, + "step": 846, + "time_per_iteration": 2.7488932609558105 + }, + { + "auxiliary_loss_clip": 0.01192009, + "auxiliary_loss_mlp": 0.01097075, + "balance_loss_clip": 1.0561285, + "balance_loss_mlp": 1.01234126, + "epoch": 0.10184572837131005, + "flos": 21104845136640.0, + "grad_norm": 2.7617417002100773, + "language_loss": 0.86567533, + "learning_rate": 3.946176201825487e-06, + "loss": 0.88856614, + "num_input_tokens_seen": 18053160, + "step": 847, + "time_per_iteration": 2.678567886352539 + }, + { + "auxiliary_loss_clip": 0.01186848, + "auxiliary_loss_mlp": 0.01095537, + "balance_loss_clip": 1.0538274, + "balance_loss_mlp": 1.01066053, + "epoch": 0.10196597126194913, + "flos": 26067591918720.0, + "grad_norm": 1.895725954310029, + "language_loss": 0.8341822, + "learning_rate": 3.9459965531348575e-06, + "loss": 0.85700607, + "num_input_tokens_seen": 18072815, + "step": 848, + "time_per_iteration": 2.8695292472839355 + }, + { + "auxiliary_loss_clip": 0.0118389, + "auxiliary_loss_mlp": 0.00875598, + "balance_loss_clip": 1.0506233, + "balance_loss_mlp": 1.0003159, + "epoch": 0.10208621415258823, + "flos": 29314634595840.0, + "grad_norm": 2.1364149206738072, + "language_loss": 0.85671914, + "learning_rate": 3.945816609238098e-06, + "loss": 0.87731403, + "num_input_tokens_seen": 18092225, + "step": 849, + "time_per_iteration": 2.844052314758301 + }, + { + "auxiliary_loss_clip": 0.01154312, + "auxiliary_loss_mlp": 0.01098306, + "balance_loss_clip": 1.04681563, + "balance_loss_mlp": 1.01328564, + "epoch": 0.10220645704322733, + "flos": 23805794367360.0, + "grad_norm": 1.9595608045444854, + "language_loss": 0.85240966, + "learning_rate": 3.945636370162507e-06, + "loss": 0.87493587, + "num_input_tokens_seen": 18112335, + "step": 850, + "time_per_iteration": 2.84367036819458 + }, + { + "auxiliary_loss_clip": 0.01198034, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_clip": 1.05475879, + "balance_loss_mlp": 1.00916433, + "epoch": 0.10232669993386641, + "flos": 23218546913280.0, + "grad_norm": 1.780974871372841, + "language_loss": 0.79229695, + "learning_rate": 3.945455835935425e-06, + "loss": 0.81521058, + "num_input_tokens_seen": 18131520, + "step": 851, + "time_per_iteration": 2.6956937313079834 + }, + { + "auxiliary_loss_clip": 0.01185547, + "auxiliary_loss_mlp": 0.01095363, + "balance_loss_clip": 1.05144858, + "balance_loss_mlp": 1.01072454, + "epoch": 0.1024469428245055, + "flos": 22922929981440.0, + "grad_norm": 2.9691372901682267, + "language_loss": 0.75247931, + "learning_rate": 3.94527500658424e-06, + "loss": 0.77528846, + "num_input_tokens_seen": 18149185, + "step": 852, + "time_per_iteration": 2.747331142425537 + }, + { + "auxiliary_loss_clip": 0.0116493, + "auxiliary_loss_mlp": 0.01096308, + "balance_loss_clip": 1.04994345, + "balance_loss_mlp": 1.01200318, + "epoch": 0.10256718571514459, + "flos": 31359495957120.0, + "grad_norm": 2.3622600424573723, + "language_loss": 0.81387186, + "learning_rate": 3.945093882136382e-06, + "loss": 0.83648419, + "num_input_tokens_seen": 18172960, + "step": 853, + "time_per_iteration": 2.925464630126953 + }, + { + "auxiliary_loss_clip": 0.01173518, + "auxiliary_loss_mlp": 0.00875537, + "balance_loss_clip": 1.04544258, + "balance_loss_mlp": 1.00032496, + "epoch": 0.10268742860578368, + "flos": 23474877344640.0, + "grad_norm": 2.045114706735104, + "language_loss": 0.84829545, + "learning_rate": 3.944912462619329e-06, + "loss": 0.86878598, + "num_input_tokens_seen": 18191925, + "step": 854, + "time_per_iteration": 2.774251937866211 + }, + { + "auxiliary_loss_clip": 0.01184229, + "auxiliary_loss_mlp": 0.01097502, + "balance_loss_clip": 1.04846132, + "balance_loss_mlp": 1.01210046, + "epoch": 0.10280767149642277, + "flos": 25520313323520.0, + "grad_norm": 1.8301904653318055, + "language_loss": 0.80597609, + "learning_rate": 3.9447307480606025e-06, + "loss": 0.82879335, + "num_input_tokens_seen": 18212010, + "step": 855, + "time_per_iteration": 2.756497859954834 + }, + { + "auxiliary_loss_clip": 0.01185461, + "auxiliary_loss_mlp": 0.01096001, + "balance_loss_clip": 1.05197072, + "balance_loss_mlp": 1.01136231, + "epoch": 0.10292791438706186, + "flos": 17347691462400.0, + "grad_norm": 2.1981063674846046, + "language_loss": 0.90064657, + "learning_rate": 3.944548738487767e-06, + "loss": 0.9234612, + "num_input_tokens_seen": 18229525, + "step": 856, + "time_per_iteration": 4.633031368255615 + }, + { + "auxiliary_loss_clip": 0.01210143, + "auxiliary_loss_mlp": 0.01098087, + "balance_loss_clip": 1.05771351, + "balance_loss_mlp": 1.01273334, + "epoch": 0.10304815727770096, + "flos": 27052693390080.0, + "grad_norm": 1.8352237990320914, + "language_loss": 0.90729105, + "learning_rate": 3.944366433928434e-06, + "loss": 0.93037343, + "num_input_tokens_seen": 18249505, + "step": 857, + "time_per_iteration": 3.7044713497161865 + }, + { + "auxiliary_loss_clip": 0.01189555, + "auxiliary_loss_mlp": 0.01098608, + "balance_loss_clip": 1.05342412, + "balance_loss_mlp": 1.01430285, + "epoch": 0.10316840016834004, + "flos": 22782591544320.0, + "grad_norm": 1.4385503224615352, + "language_loss": 0.83626902, + "learning_rate": 3.9441838344102594e-06, + "loss": 0.85915065, + "num_input_tokens_seen": 18269230, + "step": 858, + "time_per_iteration": 2.789588451385498 + }, + { + "auxiliary_loss_clip": 0.01188486, + "auxiliary_loss_mlp": 0.01097162, + "balance_loss_clip": 1.05183935, + "balance_loss_mlp": 1.01233292, + "epoch": 0.10328864305897914, + "flos": 20704584908160.0, + "grad_norm": 2.4269154836574724, + "language_loss": 0.67459399, + "learning_rate": 3.944000939960943e-06, + "loss": 0.69745052, + "num_input_tokens_seen": 18287955, + "step": 859, + "time_per_iteration": 2.736233949661255 + }, + { + "auxiliary_loss_clip": 0.01198235, + "auxiliary_loss_mlp": 0.01097084, + "balance_loss_clip": 1.05348313, + "balance_loss_mlp": 1.01230216, + "epoch": 0.10340888594961822, + "flos": 28478814048000.0, + "grad_norm": 1.5038210339738072, + "language_loss": 0.79833472, + "learning_rate": 3.943817750608229e-06, + "loss": 0.82128787, + "num_input_tokens_seen": 18310505, + "step": 860, + "time_per_iteration": 3.721118211746216 + }, + { + "auxiliary_loss_clip": 0.01196937, + "auxiliary_loss_mlp": 0.0109492, + "balance_loss_clip": 1.05333352, + "balance_loss_mlp": 1.01071048, + "epoch": 0.10352912884025732, + "flos": 13370333460480.0, + "grad_norm": 2.206153758703762, + "language_loss": 0.82060587, + "learning_rate": 3.943634266379908e-06, + "loss": 0.84352446, + "num_input_tokens_seen": 18327400, + "step": 861, + "time_per_iteration": 2.7143633365631104 + }, + { + "auxiliary_loss_clip": 0.01197918, + "auxiliary_loss_mlp": 0.01095857, + "balance_loss_clip": 1.05342221, + "balance_loss_mlp": 1.01155245, + "epoch": 0.10364937173089642, + "flos": 25558558329600.0, + "grad_norm": 1.8392029815385145, + "language_loss": 0.85085166, + "learning_rate": 3.943450487303815e-06, + "loss": 0.87378943, + "num_input_tokens_seen": 18347895, + "step": 862, + "time_per_iteration": 2.848884344100952 + }, + { + "auxiliary_loss_clip": 0.01189766, + "auxiliary_loss_mlp": 0.01095259, + "balance_loss_clip": 1.0491401, + "balance_loss_mlp": 1.0104773, + "epoch": 0.1037696146215355, + "flos": 21215486004480.0, + "grad_norm": 2.135534983487173, + "language_loss": 0.85121518, + "learning_rate": 3.943266413407827e-06, + "loss": 0.87406546, + "num_input_tokens_seen": 18367170, + "step": 863, + "time_per_iteration": 2.8213021755218506 + }, + { + "auxiliary_loss_clip": 0.01190463, + "auxiliary_loss_mlp": 0.0109471, + "balance_loss_clip": 1.0474391, + "balance_loss_mlp": 1.01045227, + "epoch": 0.1038898575121746, + "flos": 25807382818560.0, + "grad_norm": 1.783892728279738, + "language_loss": 0.85271037, + "learning_rate": 3.94308204471987e-06, + "loss": 0.87556207, + "num_input_tokens_seen": 18386185, + "step": 864, + "time_per_iteration": 2.7434961795806885 + }, + { + "auxiliary_loss_clip": 0.01175262, + "auxiliary_loss_mlp": 0.01095461, + "balance_loss_clip": 1.05118275, + "balance_loss_mlp": 1.01134658, + "epoch": 0.10401010040281368, + "flos": 19062425900160.0, + "grad_norm": 2.2993286014684515, + "language_loss": 0.74649918, + "learning_rate": 3.942897381267912e-06, + "loss": 0.76920646, + "num_input_tokens_seen": 18402550, + "step": 865, + "time_per_iteration": 2.7490172386169434 + }, + { + "auxiliary_loss_clip": 0.01195477, + "auxiliary_loss_mlp": 0.01098311, + "balance_loss_clip": 1.05293059, + "balance_loss_mlp": 1.01343417, + "epoch": 0.10413034329345278, + "flos": 16355119962240.0, + "grad_norm": 3.947548479822503, + "language_loss": 0.66172278, + "learning_rate": 3.942712423079965e-06, + "loss": 0.68466067, + "num_input_tokens_seen": 18418940, + "step": 866, + "time_per_iteration": 2.723050832748413 + }, + { + "auxiliary_loss_clip": 0.01161924, + "auxiliary_loss_mlp": 0.01093169, + "balance_loss_clip": 1.05067396, + "balance_loss_mlp": 1.00886393, + "epoch": 0.10425058618409186, + "flos": 17236511890560.0, + "grad_norm": 2.369217264604232, + "language_loss": 0.89924765, + "learning_rate": 3.942527170184088e-06, + "loss": 0.92179847, + "num_input_tokens_seen": 18435560, + "step": 867, + "time_per_iteration": 2.762113332748413 + }, + { + "auxiliary_loss_clip": 0.01206356, + "auxiliary_loss_mlp": 0.01093246, + "balance_loss_clip": 1.05400634, + "balance_loss_mlp": 1.00884545, + "epoch": 0.10437082907473096, + "flos": 17967365919360.0, + "grad_norm": 3.0452991968644723, + "language_loss": 0.77445054, + "learning_rate": 3.942341622608385e-06, + "loss": 0.79744655, + "num_input_tokens_seen": 18452590, + "step": 868, + "time_per_iteration": 2.6225452423095703 + }, + { + "auxiliary_loss_clip": 0.01187083, + "auxiliary_loss_mlp": 0.01095838, + "balance_loss_clip": 1.05338001, + "balance_loss_mlp": 1.0112946, + "epoch": 0.10449107196537005, + "flos": 36283315374720.0, + "grad_norm": 1.94022039410819, + "language_loss": 0.77927476, + "learning_rate": 3.942155780381001e-06, + "loss": 0.802104, + "num_input_tokens_seen": 18476325, + "step": 869, + "time_per_iteration": 2.861255168914795 + }, + { + "auxiliary_loss_clip": 0.01187072, + "auxiliary_loss_mlp": 0.01096502, + "balance_loss_clip": 1.05268788, + "balance_loss_mlp": 1.01186359, + "epoch": 0.10461131485600914, + "flos": 23802095266560.0, + "grad_norm": 1.9626967666596284, + "language_loss": 0.76126504, + "learning_rate": 3.94196964353013e-06, + "loss": 0.78410077, + "num_input_tokens_seen": 18495775, + "step": 870, + "time_per_iteration": 2.81258225440979 + }, + { + "auxiliary_loss_clip": 0.01181949, + "auxiliary_loss_mlp": 0.00875432, + "balance_loss_clip": 1.05283165, + "balance_loss_mlp": 1.00033116, + "epoch": 0.10473155774664823, + "flos": 18405476104320.0, + "grad_norm": 1.801367024978777, + "language_loss": 0.80769515, + "learning_rate": 3.941783212084008e-06, + "loss": 0.82826889, + "num_input_tokens_seen": 18513530, + "step": 871, + "time_per_iteration": 2.7315127849578857 + }, + { + "auxiliary_loss_clip": 0.0117341, + "auxiliary_loss_mlp": 0.01098228, + "balance_loss_clip": 1.05032516, + "balance_loss_mlp": 1.01387596, + "epoch": 0.10485180063728732, + "flos": 25592637358080.0, + "grad_norm": 2.4369617836317334, + "language_loss": 0.7894603, + "learning_rate": 3.941596486070916e-06, + "loss": 0.81217664, + "num_input_tokens_seen": 18531575, + "step": 872, + "time_per_iteration": 2.7872862815856934 + }, + { + "auxiliary_loss_clip": 0.01155644, + "auxiliary_loss_mlp": 0.01096642, + "balance_loss_clip": 1.04509115, + "balance_loss_mlp": 1.01224184, + "epoch": 0.10497204352792641, + "flos": 27088747666560.0, + "grad_norm": 6.707432356181925, + "language_loss": 0.58613908, + "learning_rate": 3.941409465519182e-06, + "loss": 0.60866201, + "num_input_tokens_seen": 18552100, + "step": 873, + "time_per_iteration": 2.837873697280884 + }, + { + "auxiliary_loss_clip": 0.0119863, + "auxiliary_loss_mlp": 0.01095745, + "balance_loss_clip": 1.05427194, + "balance_loss_mlp": 1.01139271, + "epoch": 0.10509228641856551, + "flos": 32858479353600.0, + "grad_norm": 2.5032272617658964, + "language_loss": 0.8552013, + "learning_rate": 3.941222150457176e-06, + "loss": 0.8781451, + "num_input_tokens_seen": 18575355, + "step": 874, + "time_per_iteration": 2.843038558959961 + }, + { + "auxiliary_loss_clip": 0.01197656, + "auxiliary_loss_mlp": 0.01092589, + "balance_loss_clip": 1.05301833, + "balance_loss_mlp": 1.00847447, + "epoch": 0.10521252930920459, + "flos": 14319165173760.0, + "grad_norm": 2.7659957112057434, + "language_loss": 0.7152822, + "learning_rate": 3.941034540913311e-06, + "loss": 0.73818469, + "num_input_tokens_seen": 18592885, + "step": 875, + "time_per_iteration": 2.7814040184020996 + }, + { + "auxiliary_loss_clip": 0.01198279, + "auxiliary_loss_mlp": 0.00875552, + "balance_loss_clip": 1.05481815, + "balance_loss_mlp": 1.00035071, + "epoch": 0.10533277219984369, + "flos": 21687028773120.0, + "grad_norm": 1.7653929309672522, + "language_loss": 0.82608283, + "learning_rate": 3.940846636916051e-06, + "loss": 0.84682119, + "num_input_tokens_seen": 18612920, + "step": 876, + "time_per_iteration": 2.7845041751861572 + }, + { + "auxiliary_loss_clip": 0.01183639, + "auxiliary_loss_mlp": 0.01094705, + "balance_loss_clip": 1.05140626, + "balance_loss_mlp": 1.01068652, + "epoch": 0.10545301509048277, + "flos": 22269787027200.0, + "grad_norm": 1.8712167036906793, + "language_loss": 0.86470306, + "learning_rate": 3.940658438493899e-06, + "loss": 0.88748658, + "num_input_tokens_seen": 18630765, + "step": 877, + "time_per_iteration": 2.840050220489502 + }, + { + "auxiliary_loss_clip": 0.0120441, + "auxiliary_loss_mlp": 0.01093726, + "balance_loss_clip": 1.05207133, + "balance_loss_mlp": 1.00923085, + "epoch": 0.10557325798112187, + "flos": 22199725549440.0, + "grad_norm": 5.357374901331287, + "language_loss": 0.75908542, + "learning_rate": 3.940469945675405e-06, + "loss": 0.78206676, + "num_input_tokens_seen": 18649150, + "step": 878, + "time_per_iteration": 2.675497531890869 + }, + { + "auxiliary_loss_clip": 0.01159057, + "auxiliary_loss_mlp": 0.01096798, + "balance_loss_clip": 1.05106854, + "balance_loss_mlp": 1.0126363, + "epoch": 0.10569350087176095, + "flos": 25775889569280.0, + "grad_norm": 3.1292683885935513, + "language_loss": 0.91375709, + "learning_rate": 3.940281158489163e-06, + "loss": 0.9363156, + "num_input_tokens_seen": 18668380, + "step": 879, + "time_per_iteration": 2.9473366737365723 + }, + { + "auxiliary_loss_clip": 0.01140189, + "auxiliary_loss_mlp": 0.01096171, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.0117712, + "epoch": 0.10581374376240005, + "flos": 17311385790720.0, + "grad_norm": 1.594129756833059, + "language_loss": 0.82993746, + "learning_rate": 3.940092076963812e-06, + "loss": 0.85230106, + "num_input_tokens_seen": 18685875, + "step": 880, + "time_per_iteration": 2.863973379135132 + }, + { + "auxiliary_loss_clip": 0.01192538, + "auxiliary_loss_mlp": 0.01095287, + "balance_loss_clip": 1.05606687, + "balance_loss_mlp": 1.01079154, + "epoch": 0.10593398665303914, + "flos": 34349454017280.0, + "grad_norm": 2.0582484102323386, + "language_loss": 0.78619456, + "learning_rate": 3.9399027011280355e-06, + "loss": 0.80907273, + "num_input_tokens_seen": 18707970, + "step": 881, + "time_per_iteration": 4.697638750076294 + }, + { + "auxiliary_loss_clip": 0.01179028, + "auxiliary_loss_mlp": 0.01095472, + "balance_loss_clip": 1.04986167, + "balance_loss_mlp": 1.01083386, + "epoch": 0.10605422954367823, + "flos": 23257977068160.0, + "grad_norm": 1.8406298864302475, + "language_loss": 0.77324259, + "learning_rate": 3.939713031010561e-06, + "loss": 0.79598755, + "num_input_tokens_seen": 18726335, + "step": 882, + "time_per_iteration": 3.7754366397857666 + }, + { + "auxiliary_loss_clip": 0.01167157, + "auxiliary_loss_mlp": 0.01096508, + "balance_loss_clip": 1.04565382, + "balance_loss_mlp": 1.01205981, + "epoch": 0.10617447243431732, + "flos": 22820118278400.0, + "grad_norm": 1.9865320318093431, + "language_loss": 0.77428603, + "learning_rate": 3.939523066640163e-06, + "loss": 0.79692274, + "num_input_tokens_seen": 18745230, + "step": 883, + "time_per_iteration": 2.825282573699951 + }, + { + "auxiliary_loss_clip": 0.01201114, + "auxiliary_loss_mlp": 0.01095968, + "balance_loss_clip": 1.05673563, + "balance_loss_mlp": 1.01156819, + "epoch": 0.10629471532495641, + "flos": 24386577373440.0, + "grad_norm": 1.844730538426986, + "language_loss": 0.81164795, + "learning_rate": 3.939332808045657e-06, + "loss": 0.83461881, + "num_input_tokens_seen": 18764880, + "step": 884, + "time_per_iteration": 3.6976890563964844 + }, + { + "auxiliary_loss_clip": 0.01172998, + "auxiliary_loss_mlp": 0.01096243, + "balance_loss_clip": 1.04840851, + "balance_loss_mlp": 1.01179528, + "epoch": 0.1064149582155955, + "flos": 21105491581440.0, + "grad_norm": 1.8059900009119048, + "language_loss": 0.84707344, + "learning_rate": 3.939142255255906e-06, + "loss": 0.86976576, + "num_input_tokens_seen": 18785765, + "step": 885, + "time_per_iteration": 2.8200433254241943 + }, + { + "auxiliary_loss_clip": 0.01194711, + "auxiliary_loss_mlp": 0.01094177, + "balance_loss_clip": 1.05219078, + "balance_loss_mlp": 1.01006269, + "epoch": 0.1065352011062346, + "flos": 20702035042560.0, + "grad_norm": 2.391776975181956, + "language_loss": 0.86771357, + "learning_rate": 3.938951408299817e-06, + "loss": 0.89060247, + "num_input_tokens_seen": 18804605, + "step": 886, + "time_per_iteration": 2.72483491897583 + }, + { + "auxiliary_loss_clip": 0.01130599, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_clip": 1.0368824, + "balance_loss_mlp": 1.00116336, + "epoch": 0.10665544399687368, + "flos": 62659632689280.0, + "grad_norm": 0.799575095995276, + "language_loss": 0.54433984, + "learning_rate": 3.938760267206342e-06, + "loss": 0.5664624, + "num_input_tokens_seen": 18866425, + "step": 887, + "time_per_iteration": 3.2764511108398438 + }, + { + "auxiliary_loss_clip": 0.01205596, + "auxiliary_loss_mlp": 0.01093149, + "balance_loss_clip": 1.05338061, + "balance_loss_mlp": 1.00913, + "epoch": 0.10677568688751278, + "flos": 26140382830080.0, + "grad_norm": 1.9733357594161487, + "language_loss": 0.78311384, + "learning_rate": 3.938568832004475e-06, + "loss": 0.80610132, + "num_input_tokens_seen": 18885130, + "step": 888, + "time_per_iteration": 2.7836647033691406 + }, + { + "auxiliary_loss_clip": 0.0118281, + "auxiliary_loss_mlp": 0.01093777, + "balance_loss_clip": 1.04906344, + "balance_loss_mlp": 1.00932956, + "epoch": 0.10689592977815186, + "flos": 12786533712000.0, + "grad_norm": 1.8432017805707486, + "language_loss": 0.7555244, + "learning_rate": 3.938377102723257e-06, + "loss": 0.77829027, + "num_input_tokens_seen": 18902265, + "step": 889, + "time_per_iteration": 2.9164793491363525 + }, + { + "auxiliary_loss_clip": 0.01156781, + "auxiliary_loss_mlp": 0.0109627, + "balance_loss_clip": 1.04866564, + "balance_loss_mlp": 1.01163125, + "epoch": 0.10701617266879096, + "flos": 22126683242880.0, + "grad_norm": 3.006791440357775, + "language_loss": 0.83500355, + "learning_rate": 3.938185079391774e-06, + "loss": 0.85753405, + "num_input_tokens_seen": 18919310, + "step": 890, + "time_per_iteration": 2.8729207515716553 + }, + { + "auxiliary_loss_clip": 0.01207313, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.05506611, + "balance_loss_mlp": 1.01088047, + "epoch": 0.10713641555943004, + "flos": 19745625559680.0, + "grad_norm": 2.4751211907755204, + "language_loss": 1.05748677, + "learning_rate": 3.937992762039157e-06, + "loss": 1.08051229, + "num_input_tokens_seen": 18932635, + "step": 891, + "time_per_iteration": 2.865468740463257 + }, + { + "auxiliary_loss_clip": 0.01194147, + "auxiliary_loss_mlp": 0.01097323, + "balance_loss_clip": 1.05275488, + "balance_loss_mlp": 1.01316118, + "epoch": 0.10725665845006914, + "flos": 23952992302080.0, + "grad_norm": 1.5668821525274532, + "language_loss": 0.80301583, + "learning_rate": 3.937800150694577e-06, + "loss": 0.8259306, + "num_input_tokens_seen": 18953810, + "step": 892, + "time_per_iteration": 2.794367790222168 + }, + { + "auxiliary_loss_clip": 0.0115581, + "auxiliary_loss_mlp": 0.01095273, + "balance_loss_clip": 1.04239869, + "balance_loss_mlp": 1.01068258, + "epoch": 0.10737690134070824, + "flos": 18551704371840.0, + "grad_norm": 2.3555408011853523, + "language_loss": 0.76265335, + "learning_rate": 3.937607245387255e-06, + "loss": 0.78516424, + "num_input_tokens_seen": 18973175, + "step": 893, + "time_per_iteration": 2.828474760055542 + }, + { + "auxiliary_loss_clip": 0.01189959, + "auxiliary_loss_mlp": 0.01092905, + "balance_loss_clip": 1.05402637, + "balance_loss_mlp": 1.00902915, + "epoch": 0.10749714423134732, + "flos": 22707609903360.0, + "grad_norm": 1.9252963552730662, + "language_loss": 0.72123706, + "learning_rate": 3.937414046146455e-06, + "loss": 0.74406576, + "num_input_tokens_seen": 18991130, + "step": 894, + "time_per_iteration": 2.770158529281616 + }, + { + "auxiliary_loss_clip": 0.01208619, + "auxiliary_loss_mlp": 0.01097739, + "balance_loss_clip": 1.05688465, + "balance_loss_mlp": 1.01333916, + "epoch": 0.10761738712198642, + "flos": 21106066199040.0, + "grad_norm": 1.8613474790297817, + "language_loss": 0.74798083, + "learning_rate": 3.9372205530014845e-06, + "loss": 0.77104437, + "num_input_tokens_seen": 19009610, + "step": 895, + "time_per_iteration": 2.7191364765167236 + }, + { + "auxiliary_loss_clip": 0.01203819, + "auxiliary_loss_mlp": 0.01095576, + "balance_loss_clip": 1.05160642, + "balance_loss_mlp": 1.01131916, + "epoch": 0.1077376300126255, + "flos": 23766723348480.0, + "grad_norm": 3.788667659364793, + "language_loss": 0.71279085, + "learning_rate": 3.937026765981696e-06, + "loss": 0.73578477, + "num_input_tokens_seen": 19029680, + "step": 896, + "time_per_iteration": 2.6907308101654053 + }, + { + "auxiliary_loss_clip": 0.01171522, + "auxiliary_loss_mlp": 0.01094205, + "balance_loss_clip": 1.04934013, + "balance_loss_mlp": 1.00975728, + "epoch": 0.1078578729032646, + "flos": 20919581763840.0, + "grad_norm": 2.024032346853209, + "language_loss": 0.79520977, + "learning_rate": 3.936832685116488e-06, + "loss": 0.81786698, + "num_input_tokens_seen": 19047775, + "step": 897, + "time_per_iteration": 2.8592963218688965 + }, + { + "auxiliary_loss_clip": 0.01205677, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_clip": 1.05307651, + "balance_loss_mlp": 1.00880432, + "epoch": 0.10797811579390369, + "flos": 14829886702080.0, + "grad_norm": 2.271838066220001, + "language_loss": 0.89944166, + "learning_rate": 3.936638310435301e-06, + "loss": 0.92242759, + "num_input_tokens_seen": 19065640, + "step": 898, + "time_per_iteration": 2.6485137939453125 + }, + { + "auxiliary_loss_clip": 0.01192197, + "auxiliary_loss_mlp": 0.01092759, + "balance_loss_clip": 1.04902089, + "balance_loss_mlp": 1.0089314, + "epoch": 0.10809835868454278, + "flos": 19536985411200.0, + "grad_norm": 2.1863202543138027, + "language_loss": 0.81162763, + "learning_rate": 3.936443641967623e-06, + "loss": 0.83447719, + "num_input_tokens_seen": 19084470, + "step": 899, + "time_per_iteration": 2.69286847114563 + }, + { + "auxiliary_loss_clip": 0.01179693, + "auxiliary_loss_mlp": 0.01096069, + "balance_loss_clip": 1.04554367, + "balance_loss_mlp": 1.01185942, + "epoch": 0.10821860157518187, + "flos": 18442320480000.0, + "grad_norm": 2.347911671953346, + "language_loss": 0.83107626, + "learning_rate": 3.936248679742983e-06, + "loss": 0.85383385, + "num_input_tokens_seen": 19102965, + "step": 900, + "time_per_iteration": 2.7400643825531006 + }, + { + "auxiliary_loss_clip": 0.01153051, + "auxiliary_loss_mlp": 0.01081565, + "balance_loss_clip": 1.05549741, + "balance_loss_mlp": 1.00107503, + "epoch": 0.10833884446582095, + "flos": 49359468447360.0, + "grad_norm": 1.0596926197986027, + "language_loss": 0.70168948, + "learning_rate": 3.936053423790959e-06, + "loss": 0.72403568, + "num_input_tokens_seen": 19151285, + "step": 901, + "time_per_iteration": 3.101490020751953 + }, + { + "auxiliary_loss_clip": 0.01208158, + "auxiliary_loss_mlp": 0.01093346, + "balance_loss_clip": 1.05567122, + "balance_loss_mlp": 1.00951755, + "epoch": 0.10845908735646005, + "flos": 20411912891520.0, + "grad_norm": 1.7630978547961378, + "language_loss": 0.7730059, + "learning_rate": 3.935857874141168e-06, + "loss": 0.79602098, + "num_input_tokens_seen": 19170120, + "step": 902, + "time_per_iteration": 2.8999905586242676 + }, + { + "auxiliary_loss_clip": 0.01171439, + "auxiliary_loss_mlp": 0.01094224, + "balance_loss_clip": 1.0462079, + "balance_loss_mlp": 1.01006186, + "epoch": 0.10857933024709913, + "flos": 14027750133120.0, + "grad_norm": 2.505759370192402, + "language_loss": 0.83419675, + "learning_rate": 3.935662030823279e-06, + "loss": 0.85685337, + "num_input_tokens_seen": 19186305, + "step": 903, + "time_per_iteration": 2.7789957523345947 + }, + { + "auxiliary_loss_clip": 0.01190902, + "auxiliary_loss_mlp": 0.01095051, + "balance_loss_clip": 1.04805589, + "balance_loss_mlp": 1.01079392, + "epoch": 0.10869957313773823, + "flos": 13369004657280.0, + "grad_norm": 4.104319048325208, + "language_loss": 0.72566187, + "learning_rate": 3.935465893866998e-06, + "loss": 0.74852139, + "num_input_tokens_seen": 19204530, + "step": 904, + "time_per_iteration": 2.7690346240997314 + }, + { + "auxiliary_loss_clip": 0.01182757, + "auxiliary_loss_mlp": 0.01095364, + "balance_loss_clip": 1.0509845, + "balance_loss_mlp": 1.01125002, + "epoch": 0.10881981602837733, + "flos": 25807095509760.0, + "grad_norm": 2.0066940182131, + "language_loss": 0.80124009, + "learning_rate": 3.935269463302079e-06, + "loss": 0.82402128, + "num_input_tokens_seen": 19222735, + "step": 905, + "time_per_iteration": 2.8069686889648438 + }, + { + "auxiliary_loss_clip": 0.01193295, + "auxiliary_loss_mlp": 0.01094386, + "balance_loss_clip": 1.05024791, + "balance_loss_mlp": 1.01031983, + "epoch": 0.10894005891901641, + "flos": 20777555387520.0, + "grad_norm": 2.5834121811014064, + "language_loss": 0.76937371, + "learning_rate": 3.935072739158322e-06, + "loss": 0.79225051, + "num_input_tokens_seen": 19242445, + "step": 906, + "time_per_iteration": 3.706759214401245 + }, + { + "auxiliary_loss_clip": 0.01185329, + "auxiliary_loss_mlp": 0.01096738, + "balance_loss_clip": 1.05190635, + "balance_loss_mlp": 1.01248085, + "epoch": 0.10906030180965551, + "flos": 26649883296000.0, + "grad_norm": 1.6546791973446058, + "language_loss": 0.80124825, + "learning_rate": 3.934875721465569e-06, + "loss": 0.8240689, + "num_input_tokens_seen": 19262865, + "step": 907, + "time_per_iteration": 4.6912617683410645 + }, + { + "auxiliary_loss_clip": 0.01170978, + "auxiliary_loss_mlp": 0.0109391, + "balance_loss_clip": 1.05043828, + "balance_loss_mlp": 1.00941443, + "epoch": 0.10918054470029459, + "flos": 36534402420480.0, + "grad_norm": 2.3337529861461386, + "language_loss": 0.71780002, + "learning_rate": 3.9346784102537076e-06, + "loss": 0.74044889, + "num_input_tokens_seen": 19285000, + "step": 908, + "time_per_iteration": 2.8320751190185547 + }, + { + "auxiliary_loss_clip": 0.0120503, + "auxiliary_loss_mlp": 0.01092216, + "balance_loss_clip": 1.0525012, + "balance_loss_mlp": 1.00829232, + "epoch": 0.10930078759093369, + "flos": 21762549118080.0, + "grad_norm": 1.6974049088341037, + "language_loss": 0.77983433, + "learning_rate": 3.934480805552669e-06, + "loss": 0.80280679, + "num_input_tokens_seen": 19306010, + "step": 909, + "time_per_iteration": 2.71197509765625 + }, + { + "auxiliary_loss_clip": 0.01205306, + "auxiliary_loss_mlp": 0.00875422, + "balance_loss_clip": 1.05308533, + "balance_loss_mlp": 1.00037038, + "epoch": 0.10942103048157277, + "flos": 22601781457920.0, + "grad_norm": 2.056097903848824, + "language_loss": 0.87773168, + "learning_rate": 3.93428290739243e-06, + "loss": 0.89853895, + "num_input_tokens_seen": 19325380, + "step": 910, + "time_per_iteration": 3.6387760639190674 + }, + { + "auxiliary_loss_clip": 0.01185062, + "auxiliary_loss_mlp": 0.01093547, + "balance_loss_clip": 1.05131006, + "balance_loss_mlp": 1.00957656, + "epoch": 0.10954127337221187, + "flos": 15045781397760.0, + "grad_norm": 2.166724382539106, + "language_loss": 0.80027694, + "learning_rate": 3.9340847158030125e-06, + "loss": 0.82306302, + "num_input_tokens_seen": 19338960, + "step": 911, + "time_per_iteration": 2.7009012699127197 + }, + { + "auxiliary_loss_clip": 0.01198689, + "auxiliary_loss_mlp": 0.01094358, + "balance_loss_clip": 1.05514717, + "balance_loss_mlp": 1.01029181, + "epoch": 0.10966151626285096, + "flos": 21650974496640.0, + "grad_norm": 1.895887318332495, + "language_loss": 0.75515288, + "learning_rate": 3.9338862308144814e-06, + "loss": 0.77808332, + "num_input_tokens_seen": 19357780, + "step": 912, + "time_per_iteration": 2.8155674934387207 + }, + { + "auxiliary_loss_clip": 0.012087, + "auxiliary_loss_mlp": 0.01098098, + "balance_loss_clip": 1.05635142, + "balance_loss_mlp": 1.01393604, + "epoch": 0.10978175915349005, + "flos": 20121359777280.0, + "grad_norm": 1.5886520476360528, + "language_loss": 0.84405309, + "learning_rate": 3.933687452456946e-06, + "loss": 0.8671211, + "num_input_tokens_seen": 19377680, + "step": 913, + "time_per_iteration": 2.6646528244018555 + }, + { + "auxiliary_loss_clip": 0.01161597, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_clip": 1.04435062, + "balance_loss_mlp": 1.00980377, + "epoch": 0.10990200204412914, + "flos": 20412667077120.0, + "grad_norm": 1.9966120877899172, + "language_loss": 0.86291277, + "learning_rate": 3.933488380760562e-06, + "loss": 0.88547218, + "num_input_tokens_seen": 19397040, + "step": 914, + "time_per_iteration": 2.7228591442108154 + }, + { + "auxiliary_loss_clip": 0.01205286, + "auxiliary_loss_mlp": 0.00875435, + "balance_loss_clip": 1.05320644, + "balance_loss_mlp": 1.00038958, + "epoch": 0.11002224493476823, + "flos": 17530117660800.0, + "grad_norm": 2.0533428975858588, + "language_loss": 0.87257755, + "learning_rate": 3.9332890157555286e-06, + "loss": 0.8933847, + "num_input_tokens_seen": 19413975, + "step": 915, + "time_per_iteration": 2.6194939613342285 + }, + { + "auxiliary_loss_clip": 0.01188634, + "auxiliary_loss_mlp": 0.01095578, + "balance_loss_clip": 1.0546385, + "balance_loss_mlp": 1.0112251, + "epoch": 0.11014248782540732, + "flos": 12203093099520.0, + "grad_norm": 1.9637461697913658, + "language_loss": 0.76742822, + "learning_rate": 3.933089357472088e-06, + "loss": 0.79027033, + "num_input_tokens_seen": 19432005, + "step": 916, + "time_per_iteration": 2.77217698097229 + }, + { + "auxiliary_loss_clip": 0.01208313, + "auxiliary_loss_mlp": 0.01094907, + "balance_loss_clip": 1.05607796, + "balance_loss_mlp": 1.01079321, + "epoch": 0.11026273071604642, + "flos": 22382977760640.0, + "grad_norm": 1.9877164423309521, + "language_loss": 0.86091578, + "learning_rate": 3.932889405940529e-06, + "loss": 0.88394797, + "num_input_tokens_seen": 19450100, + "step": 917, + "time_per_iteration": 2.6900901794433594 + }, + { + "auxiliary_loss_clip": 0.01184481, + "auxiliary_loss_mlp": 0.01095603, + "balance_loss_clip": 1.05064845, + "balance_loss_mlp": 1.0116322, + "epoch": 0.1103829736066855, + "flos": 19829046896640.0, + "grad_norm": 2.1060614376662907, + "language_loss": 0.80122769, + "learning_rate": 3.932689161191184e-06, + "loss": 0.82402849, + "num_input_tokens_seen": 19467805, + "step": 918, + "time_per_iteration": 2.836111307144165 + }, + { + "auxiliary_loss_clip": 0.01197697, + "auxiliary_loss_mlp": 0.01093692, + "balance_loss_clip": 1.05385423, + "balance_loss_mlp": 1.00948215, + "epoch": 0.1105032164973246, + "flos": 22669616292480.0, + "grad_norm": 2.1396029036219733, + "language_loss": 0.8841182, + "learning_rate": 3.93248862325443e-06, + "loss": 0.90703207, + "num_input_tokens_seen": 19486710, + "step": 919, + "time_per_iteration": 2.763240098953247 + }, + { + "auxiliary_loss_clip": 0.01189849, + "auxiliary_loss_mlp": 0.01081621, + "balance_loss_clip": 1.05792546, + "balance_loss_mlp": 1.0011313, + "epoch": 0.11062345938796368, + "flos": 66483507876480.0, + "grad_norm": 0.9396005686965447, + "language_loss": 0.64467609, + "learning_rate": 3.932287792160688e-06, + "loss": 0.66739082, + "num_input_tokens_seen": 19545170, + "step": 920, + "time_per_iteration": 3.2042171955108643 + }, + { + "auxiliary_loss_clip": 0.01197841, + "auxiliary_loss_mlp": 0.01093024, + "balance_loss_clip": 1.05419183, + "balance_loss_mlp": 1.00871921, + "epoch": 0.11074370227860278, + "flos": 21907771804800.0, + "grad_norm": 2.9420028725412286, + "language_loss": 0.80731523, + "learning_rate": 3.932086667940424e-06, + "loss": 0.83022392, + "num_input_tokens_seen": 19561875, + "step": 921, + "time_per_iteration": 2.747638463973999 + }, + { + "auxiliary_loss_clip": 0.01194144, + "auxiliary_loss_mlp": 0.00875436, + "balance_loss_clip": 1.05202174, + "balance_loss_mlp": 1.00035381, + "epoch": 0.11086394516924186, + "flos": 28658115763200.0, + "grad_norm": 1.9395259684111967, + "language_loss": 0.81475234, + "learning_rate": 3.93188525062415e-06, + "loss": 0.83544815, + "num_input_tokens_seen": 19582340, + "step": 922, + "time_per_iteration": 2.813295602798462 + }, + { + "auxiliary_loss_clip": 0.01194218, + "auxiliary_loss_mlp": 0.01094651, + "balance_loss_clip": 1.05299759, + "balance_loss_mlp": 1.01029885, + "epoch": 0.11098418805988096, + "flos": 24535247765760.0, + "grad_norm": 1.8143875384688424, + "language_loss": 0.86036438, + "learning_rate": 3.931683540242418e-06, + "loss": 0.88325298, + "num_input_tokens_seen": 19603405, + "step": 923, + "time_per_iteration": 2.8795669078826904 + }, + { + "auxiliary_loss_clip": 0.01197108, + "auxiliary_loss_mlp": 0.010903, + "balance_loss_clip": 1.05379152, + "balance_loss_mlp": 1.00642407, + "epoch": 0.11110443095052006, + "flos": 22960384888320.0, + "grad_norm": 3.588084271705645, + "language_loss": 0.90858769, + "learning_rate": 3.9314815368258295e-06, + "loss": 0.93146181, + "num_input_tokens_seen": 19619885, + "step": 924, + "time_per_iteration": 2.7436611652374268 + }, + { + "auxiliary_loss_clip": 0.01194212, + "auxiliary_loss_mlp": 0.01096627, + "balance_loss_clip": 1.05206537, + "balance_loss_mlp": 1.01265597, + "epoch": 0.11122467384115914, + "flos": 18950025265920.0, + "grad_norm": 1.580818977639797, + "language_loss": 0.78489536, + "learning_rate": 3.9312792404050275e-06, + "loss": 0.80780375, + "num_input_tokens_seen": 19637940, + "step": 925, + "time_per_iteration": 2.7238545417785645 + }, + { + "auxiliary_loss_clip": 0.01205736, + "auxiliary_loss_mlp": 0.01096929, + "balance_loss_clip": 1.05397379, + "balance_loss_mlp": 1.0130055, + "epoch": 0.11134491673179824, + "flos": 25082957324160.0, + "grad_norm": 1.7363320491818335, + "language_loss": 0.77383661, + "learning_rate": 3.9310766510107e-06, + "loss": 0.79686332, + "num_input_tokens_seen": 19657115, + "step": 926, + "time_per_iteration": 2.7070653438568115 + }, + { + "auxiliary_loss_clip": 0.01165044, + "auxiliary_loss_mlp": 0.01094212, + "balance_loss_clip": 1.04586101, + "balance_loss_mlp": 1.00971627, + "epoch": 0.11146515962243732, + "flos": 24499121662080.0, + "grad_norm": 2.415535902790945, + "language_loss": 0.9222573, + "learning_rate": 3.9308737686735806e-06, + "loss": 0.94484985, + "num_input_tokens_seen": 19677075, + "step": 927, + "time_per_iteration": 2.7995965480804443 + }, + { + "auxiliary_loss_clip": 0.01206805, + "auxiliary_loss_mlp": 0.01094929, + "balance_loss_clip": 1.0552249, + "balance_loss_mlp": 1.01052856, + "epoch": 0.11158540251307641, + "flos": 22343763087360.0, + "grad_norm": 2.29634754200574, + "language_loss": 0.8309375, + "learning_rate": 3.9306705934244455e-06, + "loss": 0.85395485, + "num_input_tokens_seen": 19697155, + "step": 928, + "time_per_iteration": 2.715529441833496 + }, + { + "auxiliary_loss_clip": 0.01179898, + "auxiliary_loss_mlp": 0.01094088, + "balance_loss_clip": 1.04742885, + "balance_loss_mlp": 1.01035571, + "epoch": 0.11170564540371551, + "flos": 19902304684800.0, + "grad_norm": 1.6728122325959287, + "language_loss": 0.88499296, + "learning_rate": 3.930467125294116e-06, + "loss": 0.90773284, + "num_input_tokens_seen": 19716705, + "step": 929, + "time_per_iteration": 2.733630657196045 + }, + { + "auxiliary_loss_clip": 0.01124582, + "auxiliary_loss_mlp": 0.01081145, + "balance_loss_clip": 1.04442549, + "balance_loss_mlp": 1.00065458, + "epoch": 0.1118258882943546, + "flos": 64586239499520.0, + "grad_norm": 0.9253248664785304, + "language_loss": 0.60522234, + "learning_rate": 3.930263364313458e-06, + "loss": 0.62727964, + "num_input_tokens_seen": 19767275, + "step": 930, + "time_per_iteration": 3.217033624649048 + }, + { + "auxiliary_loss_clip": 0.0116329, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.04624629, + "balance_loss_mlp": 1.00964761, + "epoch": 0.11194613118499369, + "flos": 17201965985280.0, + "grad_norm": 2.042846442605933, + "language_loss": 0.836092, + "learning_rate": 3.930059310513384e-06, + "loss": 0.85866153, + "num_input_tokens_seen": 19786315, + "step": 931, + "time_per_iteration": 3.7258145809173584 + }, + { + "auxiliary_loss_clip": 0.01159809, + "auxiliary_loss_mlp": 0.00875322, + "balance_loss_clip": 1.0500381, + "balance_loss_mlp": 1.00035286, + "epoch": 0.11206637407563277, + "flos": 31863465728640.0, + "grad_norm": 1.774898840271245, + "language_loss": 0.84076285, + "learning_rate": 3.929854963924846e-06, + "loss": 0.86111414, + "num_input_tokens_seen": 19806580, + "step": 932, + "time_per_iteration": 4.744401216506958 + }, + { + "auxiliary_loss_clip": 0.01168296, + "auxiliary_loss_mlp": 0.01092586, + "balance_loss_clip": 1.04486918, + "balance_loss_mlp": 1.00866234, + "epoch": 0.11218661696627187, + "flos": 21945621761280.0, + "grad_norm": 1.754471332006682, + "language_loss": 0.77222788, + "learning_rate": 3.929650324578845e-06, + "loss": 0.7948367, + "num_input_tokens_seen": 19826045, + "step": 933, + "time_per_iteration": 2.8607897758483887 + }, + { + "auxiliary_loss_clip": 0.01182545, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_clip": 1.05023527, + "balance_loss_mlp": 1.00966632, + "epoch": 0.11230685985691095, + "flos": 25878198481920.0, + "grad_norm": 2.4423524613652137, + "language_loss": 0.82031822, + "learning_rate": 3.929445392506423e-06, + "loss": 0.84308529, + "num_input_tokens_seen": 19843985, + "step": 934, + "time_per_iteration": 2.8476061820983887 + }, + { + "auxiliary_loss_clip": 0.0119202, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_clip": 1.05191696, + "balance_loss_mlp": 1.01256204, + "epoch": 0.11242710274755005, + "flos": 22231506107520.0, + "grad_norm": 1.9708097212774647, + "language_loss": 0.76121092, + "learning_rate": 3.92924016773867e-06, + "loss": 0.78409356, + "num_input_tokens_seen": 19860480, + "step": 935, + "time_per_iteration": 3.709082841873169 + }, + { + "auxiliary_loss_clip": 0.01184641, + "auxiliary_loss_mlp": 0.00875305, + "balance_loss_clip": 1.05195498, + "balance_loss_mlp": 1.00026226, + "epoch": 0.11254734563818915, + "flos": 17712184723200.0, + "grad_norm": 2.2304519709451625, + "language_loss": 0.7316308, + "learning_rate": 3.9290346503067175e-06, + "loss": 0.75223029, + "num_input_tokens_seen": 19877145, + "step": 936, + "time_per_iteration": 2.754448413848877 + }, + { + "auxiliary_loss_clip": 0.01195305, + "auxiliary_loss_mlp": 0.01094687, + "balance_loss_clip": 1.05186605, + "balance_loss_mlp": 1.01071644, + "epoch": 0.11266758852882823, + "flos": 54930397334400.0, + "grad_norm": 1.7054303437705138, + "language_loss": 0.79017115, + "learning_rate": 3.9288288402417415e-06, + "loss": 0.81307107, + "num_input_tokens_seen": 19903405, + "step": 937, + "time_per_iteration": 3.0708394050598145 + }, + { + "auxiliary_loss_clip": 0.01193595, + "auxiliary_loss_mlp": 0.01095016, + "balance_loss_clip": 1.05256009, + "balance_loss_mlp": 1.01090193, + "epoch": 0.11278783141946733, + "flos": 18878132194560.0, + "grad_norm": 2.32168460556287, + "language_loss": 0.70624673, + "learning_rate": 3.928622737574964e-06, + "loss": 0.72913283, + "num_input_tokens_seen": 19918740, + "step": 938, + "time_per_iteration": 2.70715594291687 + }, + { + "auxiliary_loss_clip": 0.01183256, + "auxiliary_loss_mlp": 0.01094492, + "balance_loss_clip": 1.05108237, + "balance_loss_mlp": 1.01032972, + "epoch": 0.11290807431010641, + "flos": 26469252777600.0, + "grad_norm": 1.684493985010628, + "language_loss": 0.90940881, + "learning_rate": 3.928416342337652e-06, + "loss": 0.93218625, + "num_input_tokens_seen": 19938475, + "step": 939, + "time_per_iteration": 2.812314510345459 + }, + { + "auxiliary_loss_clip": 0.01182434, + "auxiliary_loss_mlp": 0.01096071, + "balance_loss_clip": 1.04903388, + "balance_loss_mlp": 1.0123862, + "epoch": 0.1130283172007455, + "flos": 22710590732160.0, + "grad_norm": 1.662254354638073, + "language_loss": 0.8292551, + "learning_rate": 3.928209654561113e-06, + "loss": 0.85204017, + "num_input_tokens_seen": 19959310, + "step": 940, + "time_per_iteration": 2.791959285736084 + }, + { + "auxiliary_loss_clip": 0.01178515, + "auxiliary_loss_mlp": 0.01096259, + "balance_loss_clip": 1.04863882, + "balance_loss_mlp": 1.01243138, + "epoch": 0.1131485600913846, + "flos": 23219911630080.0, + "grad_norm": 1.8467664504640218, + "language_loss": 0.81439066, + "learning_rate": 3.928002674276703e-06, + "loss": 0.8371383, + "num_input_tokens_seen": 19978700, + "step": 941, + "time_per_iteration": 2.751171588897705 + }, + { + "auxiliary_loss_clip": 0.01150138, + "auxiliary_loss_mlp": 0.01092382, + "balance_loss_clip": 1.04396629, + "balance_loss_mlp": 1.00831532, + "epoch": 0.11326880298202369, + "flos": 14064271286400.0, + "grad_norm": 2.329018116336668, + "language_loss": 0.75552773, + "learning_rate": 3.92779540151582e-06, + "loss": 0.77795291, + "num_input_tokens_seen": 19995785, + "step": 942, + "time_per_iteration": 2.7618658542633057 + }, + { + "auxiliary_loss_clip": 0.01176147, + "auxiliary_loss_mlp": 0.01093171, + "balance_loss_clip": 1.04904675, + "balance_loss_mlp": 1.00919986, + "epoch": 0.11338904587266278, + "flos": 16325386479360.0, + "grad_norm": 1.7099641396441239, + "language_loss": 0.85744125, + "learning_rate": 3.927587836309907e-06, + "loss": 0.88013446, + "num_input_tokens_seen": 20013615, + "step": 943, + "time_per_iteration": 2.756727457046509 + }, + { + "auxiliary_loss_clip": 0.01180608, + "auxiliary_loss_mlp": 0.01093262, + "balance_loss_clip": 1.04735708, + "balance_loss_mlp": 1.00924313, + "epoch": 0.11350928876330187, + "flos": 24426258923520.0, + "grad_norm": 1.9027505547897092, + "language_loss": 0.78256845, + "learning_rate": 3.927379978690452e-06, + "loss": 0.80530709, + "num_input_tokens_seen": 20032880, + "step": 944, + "time_per_iteration": 2.801417350769043 + }, + { + "auxiliary_loss_clip": 0.01176609, + "auxiliary_loss_mlp": 0.01092373, + "balance_loss_clip": 1.04976773, + "balance_loss_mlp": 1.00859261, + "epoch": 0.11362953165394096, + "flos": 24497074586880.0, + "grad_norm": 2.189214292388826, + "language_loss": 0.87277138, + "learning_rate": 3.927171828688987e-06, + "loss": 0.8954612, + "num_input_tokens_seen": 20052405, + "step": 945, + "time_per_iteration": 2.916654586791992 + }, + { + "auxiliary_loss_clip": 0.01203043, + "auxiliary_loss_mlp": 0.01093402, + "balance_loss_clip": 1.0527575, + "balance_loss_mlp": 1.00947881, + "epoch": 0.11374977454458005, + "flos": 24060831909120.0, + "grad_norm": 2.1798152764288794, + "language_loss": 0.82135904, + "learning_rate": 3.926963386337088e-06, + "loss": 0.84432352, + "num_input_tokens_seen": 20070635, + "step": 946, + "time_per_iteration": 2.866663694381714 + }, + { + "auxiliary_loss_clip": 0.01203181, + "auxiliary_loss_mlp": 0.01095578, + "balance_loss_clip": 1.0523864, + "balance_loss_mlp": 1.01132095, + "epoch": 0.11387001743521914, + "flos": 39457638967680.0, + "grad_norm": 2.5131781107089584, + "language_loss": 0.70068043, + "learning_rate": 3.926754651666375e-06, + "loss": 0.72366798, + "num_input_tokens_seen": 20091195, + "step": 947, + "time_per_iteration": 2.855957508087158 + }, + { + "auxiliary_loss_clip": 0.01170244, + "auxiliary_loss_mlp": 0.0109444, + "balance_loss_clip": 1.04883599, + "balance_loss_mlp": 1.010517, + "epoch": 0.11399026032585824, + "flos": 25082454533760.0, + "grad_norm": 2.6025634068465475, + "language_loss": 0.78293896, + "learning_rate": 3.926545624708513e-06, + "loss": 0.80558574, + "num_input_tokens_seen": 20110435, + "step": 948, + "time_per_iteration": 2.8670692443847656 + }, + { + "auxiliary_loss_clip": 0.01173546, + "auxiliary_loss_mlp": 0.01092871, + "balance_loss_clip": 1.04853821, + "balance_loss_mlp": 1.00904334, + "epoch": 0.11411050321649732, + "flos": 17961835224960.0, + "grad_norm": 2.1834316281704544, + "language_loss": 0.85652375, + "learning_rate": 3.926336305495213e-06, + "loss": 0.87918794, + "num_input_tokens_seen": 20128995, + "step": 949, + "time_per_iteration": 2.7963340282440186 + }, + { + "auxiliary_loss_clip": 0.0117528, + "auxiliary_loss_mlp": 0.01093288, + "balance_loss_clip": 1.04973757, + "balance_loss_mlp": 1.00945997, + "epoch": 0.11423074610713642, + "flos": 22455409536000.0, + "grad_norm": 1.9610677711601812, + "language_loss": 0.88754046, + "learning_rate": 3.926126694058226e-06, + "loss": 0.91022611, + "num_input_tokens_seen": 20148145, + "step": 950, + "time_per_iteration": 2.880511999130249 + }, + { + "auxiliary_loss_clip": 0.01149318, + "auxiliary_loss_mlp": 0.01094533, + "balance_loss_clip": 1.0382781, + "balance_loss_mlp": 1.01070476, + "epoch": 0.1143509889977755, + "flos": 19717687756800.0, + "grad_norm": 1.3829120635499836, + "language_loss": 0.82241434, + "learning_rate": 3.92591679042935e-06, + "loss": 0.84485281, + "num_input_tokens_seen": 20168035, + "step": 951, + "time_per_iteration": 2.8270304203033447 + }, + { + "auxiliary_loss_clip": 0.01185917, + "auxiliary_loss_mlp": 0.0109698, + "balance_loss_clip": 1.05081546, + "balance_loss_mlp": 1.01286626, + "epoch": 0.1144712318884146, + "flos": 19822869757440.0, + "grad_norm": 1.6811907425413564, + "language_loss": 0.82467371, + "learning_rate": 3.92570659464043e-06, + "loss": 0.84750265, + "num_input_tokens_seen": 20186095, + "step": 952, + "time_per_iteration": 2.716975212097168 + }, + { + "auxiliary_loss_clip": 0.0118849, + "auxiliary_loss_mlp": 0.00875261, + "balance_loss_clip": 1.04863417, + "balance_loss_mlp": 1.00023794, + "epoch": 0.1145914747790537, + "flos": 14939198766720.0, + "grad_norm": 1.9933971545135452, + "language_loss": 0.79856068, + "learning_rate": 3.925496106723349e-06, + "loss": 0.81919825, + "num_input_tokens_seen": 20203535, + "step": 953, + "time_per_iteration": 2.7264745235443115 + }, + { + "auxiliary_loss_clip": 0.01196757, + "auxiliary_loss_mlp": 0.01094249, + "balance_loss_clip": 1.05419672, + "balance_loss_mlp": 1.01046908, + "epoch": 0.11471171766969278, + "flos": 19865029345920.0, + "grad_norm": 2.11597660914389, + "language_loss": 0.83838618, + "learning_rate": 3.9252853267100405e-06, + "loss": 0.86129624, + "num_input_tokens_seen": 20222780, + "step": 954, + "time_per_iteration": 2.6787774562835693 + }, + { + "auxiliary_loss_clip": 0.01164533, + "auxiliary_loss_mlp": 0.0109582, + "balance_loss_clip": 1.04914618, + "balance_loss_mlp": 1.01170635, + "epoch": 0.11483196056033187, + "flos": 22526476594560.0, + "grad_norm": 2.055636639809084, + "language_loss": 0.83880997, + "learning_rate": 3.9250742546324786e-06, + "loss": 0.86141348, + "num_input_tokens_seen": 20243015, + "step": 955, + "time_per_iteration": 2.8449933528900146 + }, + { + "auxiliary_loss_clip": 0.01174444, + "auxiliary_loss_mlp": 0.01093511, + "balance_loss_clip": 1.04916811, + "balance_loss_mlp": 1.00963545, + "epoch": 0.11495220345097096, + "flos": 28220292887040.0, + "grad_norm": 1.67533857294898, + "language_loss": 0.86808223, + "learning_rate": 3.924862890522683e-06, + "loss": 0.89076173, + "num_input_tokens_seen": 20263025, + "step": 956, + "time_per_iteration": 3.649613380432129 + }, + { + "auxiliary_loss_clip": 0.01184324, + "auxiliary_loss_mlp": 0.01093001, + "balance_loss_clip": 1.04991055, + "balance_loss_mlp": 1.00874376, + "epoch": 0.11507244634161005, + "flos": 17492267704320.0, + "grad_norm": 2.2156541829297773, + "language_loss": 0.8647424, + "learning_rate": 3.9246512344127174e-06, + "loss": 0.88751566, + "num_input_tokens_seen": 20280685, + "step": 957, + "time_per_iteration": 4.626638412475586 + }, + { + "auxiliary_loss_clip": 0.01125322, + "auxiliary_loss_mlp": 0.01095272, + "balance_loss_clip": 1.03819966, + "balance_loss_mlp": 1.01130068, + "epoch": 0.11519268923224914, + "flos": 22564937082240.0, + "grad_norm": 3.076506118708893, + "language_loss": 0.81837684, + "learning_rate": 3.9244392863346895e-06, + "loss": 0.84058279, + "num_input_tokens_seen": 20300090, + "step": 958, + "time_per_iteration": 2.858883857727051 + }, + { + "auxiliary_loss_clip": 0.01177032, + "auxiliary_loss_mlp": 0.01095418, + "balance_loss_clip": 1.04712713, + "balance_loss_mlp": 1.01116049, + "epoch": 0.11531293212288823, + "flos": 16982839065600.0, + "grad_norm": 1.865705781853592, + "language_loss": 0.92521477, + "learning_rate": 3.9242270463207524e-06, + "loss": 0.94793928, + "num_input_tokens_seen": 20318480, + "step": 959, + "time_per_iteration": 2.724731683731079 + }, + { + "auxiliary_loss_clip": 0.01144441, + "auxiliary_loss_mlp": 0.01092808, + "balance_loss_clip": 1.03995562, + "balance_loss_mlp": 1.00878906, + "epoch": 0.11543317501352733, + "flos": 12422004537600.0, + "grad_norm": 3.0798611567275844, + "language_loss": 0.85197282, + "learning_rate": 3.924014514403102e-06, + "loss": 0.8743453, + "num_input_tokens_seen": 20334635, + "step": 960, + "time_per_iteration": 3.7571418285369873 + }, + { + "auxiliary_loss_clip": 0.01147039, + "auxiliary_loss_mlp": 0.0109292, + "balance_loss_clip": 1.04148364, + "balance_loss_mlp": 1.00885367, + "epoch": 0.11555341790416641, + "flos": 19821648695040.0, + "grad_norm": 1.9824940377243292, + "language_loss": 0.91047233, + "learning_rate": 3.92380169061398e-06, + "loss": 0.93287194, + "num_input_tokens_seen": 20352415, + "step": 961, + "time_per_iteration": 2.9068446159362793 + }, + { + "auxiliary_loss_clip": 0.01166016, + "auxiliary_loss_mlp": 0.00875191, + "balance_loss_clip": 1.04716992, + "balance_loss_mlp": 1.00015807, + "epoch": 0.11567366079480551, + "flos": 25738865625600.0, + "grad_norm": 2.3882049433141965, + "language_loss": 0.83718568, + "learning_rate": 3.9235885749856705e-06, + "loss": 0.85759783, + "num_input_tokens_seen": 20371095, + "step": 962, + "time_per_iteration": 2.8129162788391113 + }, + { + "auxiliary_loss_clip": 0.01178982, + "auxiliary_loss_mlp": 0.01092555, + "balance_loss_clip": 1.0494411, + "balance_loss_mlp": 1.008775, + "epoch": 0.1157939036854446, + "flos": 18223301301120.0, + "grad_norm": 1.6877272314595306, + "language_loss": 0.82393378, + "learning_rate": 3.9233751675505035e-06, + "loss": 0.84664917, + "num_input_tokens_seen": 20389805, + "step": 963, + "time_per_iteration": 2.705437660217285 + }, + { + "auxiliary_loss_clip": 0.01184692, + "auxiliary_loss_mlp": 0.01093561, + "balance_loss_clip": 1.05187786, + "balance_loss_mlp": 1.00973356, + "epoch": 0.11591414657608369, + "flos": 23073755189760.0, + "grad_norm": 1.9776184639877992, + "language_loss": 0.84719807, + "learning_rate": 3.923161468340853e-06, + "loss": 0.86998057, + "num_input_tokens_seen": 20409640, + "step": 964, + "time_per_iteration": 2.732332944869995 + }, + { + "auxiliary_loss_clip": 0.01155432, + "auxiliary_loss_mlp": 0.01094595, + "balance_loss_clip": 1.04778457, + "balance_loss_mlp": 1.01043344, + "epoch": 0.11603438946672277, + "flos": 19461716461440.0, + "grad_norm": 1.6415554617365524, + "language_loss": 0.81594622, + "learning_rate": 3.9229474773891374e-06, + "loss": 0.83844644, + "num_input_tokens_seen": 20428180, + "step": 965, + "time_per_iteration": 2.867067813873291 + }, + { + "auxiliary_loss_clip": 0.01175244, + "auxiliary_loss_mlp": 0.01093791, + "balance_loss_clip": 1.05000496, + "balance_loss_mlp": 1.00953424, + "epoch": 0.11615463235736187, + "flos": 26831986272000.0, + "grad_norm": 8.801357922390372, + "language_loss": 0.83881611, + "learning_rate": 3.922733194727818e-06, + "loss": 0.86150646, + "num_input_tokens_seen": 20447975, + "step": 966, + "time_per_iteration": 2.909536600112915 + }, + { + "auxiliary_loss_clip": 0.0119319, + "auxiliary_loss_mlp": 0.01096155, + "balance_loss_clip": 1.05159748, + "balance_loss_mlp": 1.01208901, + "epoch": 0.11627487524800097, + "flos": 18580324533120.0, + "grad_norm": 1.9607874821768294, + "language_loss": 0.87525439, + "learning_rate": 3.922518620389402e-06, + "loss": 0.89814782, + "num_input_tokens_seen": 20464840, + "step": 967, + "time_per_iteration": 2.6903319358825684 + }, + { + "auxiliary_loss_clip": 0.01117169, + "auxiliary_loss_mlp": 0.01094555, + "balance_loss_clip": 1.04095936, + "balance_loss_mlp": 1.01096523, + "epoch": 0.11639511813864005, + "flos": 18150474476160.0, + "grad_norm": 1.7326714543188537, + "language_loss": 0.89568603, + "learning_rate": 3.922303754406439e-06, + "loss": 0.91780329, + "num_input_tokens_seen": 20482680, + "step": 968, + "time_per_iteration": 2.8712151050567627 + }, + { + "auxiliary_loss_clip": 0.01167488, + "auxiliary_loss_mlp": 0.01093789, + "balance_loss_clip": 1.05005479, + "balance_loss_mlp": 1.00977051, + "epoch": 0.11651536102927915, + "flos": 20922023888640.0, + "grad_norm": 2.185622194154668, + "language_loss": 0.79060435, + "learning_rate": 3.922088596811526e-06, + "loss": 0.81321716, + "num_input_tokens_seen": 20501810, + "step": 969, + "time_per_iteration": 2.767655611038208 + }, + { + "auxiliary_loss_clip": 0.01183843, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_clip": 1.05017674, + "balance_loss_mlp": 1.01006436, + "epoch": 0.11663560391991823, + "flos": 16508602776960.0, + "grad_norm": 2.9102984605805644, + "language_loss": 0.86722052, + "learning_rate": 3.9218731476373e-06, + "loss": 0.88999885, + "num_input_tokens_seen": 20517995, + "step": 970, + "time_per_iteration": 2.707728624343872 + }, + { + "auxiliary_loss_clip": 0.01191801, + "auxiliary_loss_mlp": 0.01092745, + "balance_loss_clip": 1.05199277, + "balance_loss_mlp": 1.0084877, + "epoch": 0.11675584681055733, + "flos": 19865029345920.0, + "grad_norm": 2.1114670707988488, + "language_loss": 0.84936523, + "learning_rate": 3.9216574069164455e-06, + "loss": 0.87221068, + "num_input_tokens_seen": 20536970, + "step": 971, + "time_per_iteration": 2.7035715579986572 + }, + { + "auxiliary_loss_clip": 0.01201824, + "auxiliary_loss_mlp": 0.0109132, + "balance_loss_clip": 1.05232096, + "balance_loss_mlp": 1.00782597, + "epoch": 0.11687608970119642, + "flos": 21944364785280.0, + "grad_norm": 1.6292293590260918, + "language_loss": 0.80279756, + "learning_rate": 3.921441374681691e-06, + "loss": 0.82572901, + "num_input_tokens_seen": 20557030, + "step": 972, + "time_per_iteration": 2.7269694805145264 + }, + { + "auxiliary_loss_clip": 0.01170772, + "auxiliary_loss_mlp": 0.01097207, + "balance_loss_clip": 1.04343164, + "balance_loss_mlp": 1.01333117, + "epoch": 0.1169963325918355, + "flos": 24061155131520.0, + "grad_norm": 1.829538671992596, + "language_loss": 0.65013951, + "learning_rate": 3.921225050965808e-06, + "loss": 0.67281926, + "num_input_tokens_seen": 20576915, + "step": 973, + "time_per_iteration": 2.772761344909668 + }, + { + "auxiliary_loss_clip": 0.01165252, + "auxiliary_loss_mlp": 0.01092875, + "balance_loss_clip": 1.04904962, + "balance_loss_mlp": 1.00899935, + "epoch": 0.1171165754824746, + "flos": 23368151059200.0, + "grad_norm": 2.1403222777300246, + "language_loss": 0.75097477, + "learning_rate": 3.921008435801612e-06, + "loss": 0.77355599, + "num_input_tokens_seen": 20596000, + "step": 974, + "time_per_iteration": 2.820530652999878 + }, + { + "auxiliary_loss_clip": 0.01184463, + "auxiliary_loss_mlp": 0.01094477, + "balance_loss_clip": 1.05140841, + "balance_loss_mlp": 1.01064897, + "epoch": 0.11723681837311369, + "flos": 18552243075840.0, + "grad_norm": 2.2236453815885744, + "language_loss": 0.75762045, + "learning_rate": 3.920791529221963e-06, + "loss": 0.78040981, + "num_input_tokens_seen": 20614675, + "step": 975, + "time_per_iteration": 2.692223072052002 + }, + { + "auxiliary_loss_clip": 0.01178953, + "auxiliary_loss_mlp": 0.00875293, + "balance_loss_clip": 1.04742193, + "balance_loss_mlp": 1.00020766, + "epoch": 0.11735706126375278, + "flos": 23550541344000.0, + "grad_norm": 1.9314252651835961, + "language_loss": 0.76507396, + "learning_rate": 3.920574331259768e-06, + "loss": 0.7856164, + "num_input_tokens_seen": 20635875, + "step": 976, + "time_per_iteration": 2.733140468597412 + }, + { + "auxiliary_loss_clip": 0.01176583, + "auxiliary_loss_mlp": 0.0109316, + "balance_loss_clip": 1.05005002, + "balance_loss_mlp": 1.00942695, + "epoch": 0.11747730415439187, + "flos": 22381541216640.0, + "grad_norm": 2.182838070242362, + "language_loss": 0.79501116, + "learning_rate": 3.9203568419479716e-06, + "loss": 0.81770861, + "num_input_tokens_seen": 20656430, + "step": 977, + "time_per_iteration": 2.780970573425293 + }, + { + "auxiliary_loss_clip": 0.01183615, + "auxiliary_loss_mlp": 0.01093931, + "balance_loss_clip": 1.05183566, + "balance_loss_mlp": 1.01010299, + "epoch": 0.11759754704503096, + "flos": 22200731130240.0, + "grad_norm": 1.782430972363112, + "language_loss": 0.75088978, + "learning_rate": 3.92013906131957e-06, + "loss": 0.77366519, + "num_input_tokens_seen": 20675360, + "step": 978, + "time_per_iteration": 2.7290730476379395 + }, + { + "auxiliary_loss_clip": 0.01165892, + "auxiliary_loss_mlp": 0.01093363, + "balance_loss_clip": 1.04531717, + "balance_loss_mlp": 1.00972593, + "epoch": 0.11771778993567006, + "flos": 22309755886080.0, + "grad_norm": 1.938760049976217, + "language_loss": 0.82515836, + "learning_rate": 3.9199209894076e-06, + "loss": 0.8477509, + "num_input_tokens_seen": 20695675, + "step": 979, + "time_per_iteration": 2.808297634124756 + }, + { + "auxiliary_loss_clip": 0.012003, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_clip": 1.05107951, + "balance_loss_mlp": 1.00799036, + "epoch": 0.11783803282630914, + "flos": 21288169175040.0, + "grad_norm": 1.8028022835609656, + "language_loss": 0.90239549, + "learning_rate": 3.919702626245142e-06, + "loss": 0.92532194, + "num_input_tokens_seen": 20715330, + "step": 980, + "time_per_iteration": 2.6877009868621826 + }, + { + "auxiliary_loss_clip": 0.01171144, + "auxiliary_loss_mlp": 0.01094853, + "balance_loss_clip": 1.04816604, + "balance_loss_mlp": 1.01102507, + "epoch": 0.11795827571694824, + "flos": 25371535190400.0, + "grad_norm": 2.1075683087862584, + "language_loss": 0.661062, + "learning_rate": 3.919483971865322e-06, + "loss": 0.68372196, + "num_input_tokens_seen": 20735325, + "step": 981, + "time_per_iteration": 2.7552523612976074 + }, + { + "auxiliary_loss_clip": 0.01173306, + "auxiliary_loss_mlp": 0.01093611, + "balance_loss_clip": 1.045048, + "balance_loss_mlp": 1.0096879, + "epoch": 0.11807851860758732, + "flos": 23622218933760.0, + "grad_norm": 1.917797772143155, + "language_loss": 0.88044143, + "learning_rate": 3.91926502630131e-06, + "loss": 0.90311062, + "num_input_tokens_seen": 20755940, + "step": 982, + "time_per_iteration": 4.397935628890991 + }, + { + "auxiliary_loss_clip": 0.01195181, + "auxiliary_loss_mlp": 0.0109189, + "balance_loss_clip": 1.05409026, + "balance_loss_mlp": 1.0083487, + "epoch": 0.11819876149822642, + "flos": 24972496024320.0, + "grad_norm": 1.838464967658709, + "language_loss": 0.72371161, + "learning_rate": 3.91904578958632e-06, + "loss": 0.74658233, + "num_input_tokens_seen": 20775355, + "step": 983, + "time_per_iteration": 3.5728683471679688 + }, + { + "auxiliary_loss_clip": 0.01202585, + "auxiliary_loss_mlp": 0.01091059, + "balance_loss_clip": 1.05309224, + "balance_loss_mlp": 1.00699282, + "epoch": 0.11831900438886551, + "flos": 23003226835200.0, + "grad_norm": 7.65136633125005, + "language_loss": 0.84482145, + "learning_rate": 3.918826261753608e-06, + "loss": 0.86775792, + "num_input_tokens_seen": 20794935, + "step": 984, + "time_per_iteration": 2.657935380935669 + }, + { + "auxiliary_loss_clip": 0.01174473, + "auxiliary_loss_mlp": 0.01092054, + "balance_loss_clip": 1.04449868, + "balance_loss_mlp": 1.00865543, + "epoch": 0.1184392472795046, + "flos": 27965147604480.0, + "grad_norm": 2.6101935486738315, + "language_loss": 0.71384203, + "learning_rate": 3.918606442836478e-06, + "loss": 0.7365073, + "num_input_tokens_seen": 20817155, + "step": 985, + "time_per_iteration": 2.861546516418457 + }, + { + "auxiliary_loss_clip": 0.01189747, + "auxiliary_loss_mlp": 0.01093276, + "balance_loss_clip": 1.05026376, + "balance_loss_mlp": 1.00968683, + "epoch": 0.1185594901701437, + "flos": 19898497843200.0, + "grad_norm": 2.224892926939024, + "language_loss": 0.77609384, + "learning_rate": 3.918386332868277e-06, + "loss": 0.79892409, + "num_input_tokens_seen": 20835125, + "step": 986, + "time_per_iteration": 3.630741834640503 + }, + { + "auxiliary_loss_clip": 0.01184891, + "auxiliary_loss_mlp": 0.01094274, + "balance_loss_clip": 1.05176544, + "balance_loss_mlp": 1.01020789, + "epoch": 0.11867973306078278, + "flos": 18912354877440.0, + "grad_norm": 2.8135113904746856, + "language_loss": 0.94270122, + "learning_rate": 3.918165931882394e-06, + "loss": 0.9654929, + "num_input_tokens_seen": 20853525, + "step": 987, + "time_per_iteration": 2.72196626663208 + }, + { + "auxiliary_loss_clip": 0.0114138, + "auxiliary_loss_mlp": 0.0109267, + "balance_loss_clip": 1.04682875, + "balance_loss_mlp": 1.00912774, + "epoch": 0.11879997595142187, + "flos": 16982803152000.0, + "grad_norm": 2.4045741812150085, + "language_loss": 0.7541762, + "learning_rate": 3.917945239912264e-06, + "loss": 0.77651668, + "num_input_tokens_seen": 20871000, + "step": 988, + "time_per_iteration": 2.9777004718780518 + }, + { + "auxiliary_loss_clip": 0.01154678, + "auxiliary_loss_mlp": 0.01092638, + "balance_loss_clip": 1.04485571, + "balance_loss_mlp": 1.0090003, + "epoch": 0.11892021884206096, + "flos": 17530369056000.0, + "grad_norm": 2.3908347941400256, + "language_loss": 0.76279056, + "learning_rate": 3.917724256991367e-06, + "loss": 0.78526372, + "num_input_tokens_seen": 20889745, + "step": 989, + "time_per_iteration": 2.8946175575256348 + }, + { + "auxiliary_loss_clip": 0.01173162, + "auxiliary_loss_mlp": 0.01093625, + "balance_loss_clip": 1.04992235, + "balance_loss_mlp": 1.00970149, + "epoch": 0.11904046173270005, + "flos": 30955895763840.0, + "grad_norm": 2.6185641579157535, + "language_loss": 0.81521893, + "learning_rate": 3.9175029831532245e-06, + "loss": 0.83788681, + "num_input_tokens_seen": 20909260, + "step": 990, + "time_per_iteration": 2.878391742706299 + }, + { + "auxiliary_loss_clip": 0.01170536, + "auxiliary_loss_mlp": 0.0109815, + "balance_loss_clip": 1.04860258, + "balance_loss_mlp": 1.01422715, + "epoch": 0.11916070462333915, + "flos": 20157234485760.0, + "grad_norm": 2.100167457553216, + "language_loss": 0.88583994, + "learning_rate": 3.917281418431404e-06, + "loss": 0.90852678, + "num_input_tokens_seen": 20928305, + "step": 991, + "time_per_iteration": 2.8211166858673096 + }, + { + "auxiliary_loss_clip": 0.01177709, + "auxiliary_loss_mlp": 0.01093228, + "balance_loss_clip": 1.04939735, + "balance_loss_mlp": 1.00973344, + "epoch": 0.11928094751397823, + "flos": 23551115961600.0, + "grad_norm": 5.043612725622442, + "language_loss": 0.77011693, + "learning_rate": 3.917059562859516e-06, + "loss": 0.79282629, + "num_input_tokens_seen": 20947630, + "step": 992, + "time_per_iteration": 2.8054587841033936 + }, + { + "auxiliary_loss_clip": 0.01184157, + "auxiliary_loss_mlp": 0.01092047, + "balance_loss_clip": 1.05266452, + "balance_loss_mlp": 1.00855327, + "epoch": 0.11940119040461733, + "flos": 23908426502400.0, + "grad_norm": 2.3586313783143082, + "language_loss": 0.8846603, + "learning_rate": 3.916837416471218e-06, + "loss": 0.90742236, + "num_input_tokens_seen": 20964250, + "step": 993, + "time_per_iteration": 2.792512893676758 + }, + { + "auxiliary_loss_clip": 0.01193678, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_clip": 1.05195045, + "balance_loss_mlp": 1.00855839, + "epoch": 0.11952143329525641, + "flos": 13844533835520.0, + "grad_norm": 2.408065291279568, + "language_loss": 0.72176582, + "learning_rate": 3.916614979300207e-06, + "loss": 0.74462503, + "num_input_tokens_seen": 20979095, + "step": 994, + "time_per_iteration": 2.731060266494751 + }, + { + "auxiliary_loss_clip": 0.01150654, + "auxiliary_loss_mlp": 0.01094505, + "balance_loss_clip": 1.04666364, + "balance_loss_mlp": 1.01086736, + "epoch": 0.11964167618589551, + "flos": 27015525792000.0, + "grad_norm": 1.5924495446791518, + "language_loss": 0.7901665, + "learning_rate": 3.9163922513802274e-06, + "loss": 0.81261802, + "num_input_tokens_seen": 21001430, + "step": 995, + "time_per_iteration": 2.880443572998047 + }, + { + "auxiliary_loss_clip": 0.01201104, + "auxiliary_loss_mlp": 0.01093553, + "balance_loss_clip": 1.05070078, + "balance_loss_mlp": 1.00943863, + "epoch": 0.1197619190765346, + "flos": 12567622273920.0, + "grad_norm": 2.6859498353442417, + "language_loss": 0.82412803, + "learning_rate": 3.916169232745067e-06, + "loss": 0.84707463, + "num_input_tokens_seen": 21019105, + "step": 996, + "time_per_iteration": 2.679321050643921 + }, + { + "auxiliary_loss_clip": 0.01168693, + "auxiliary_loss_mlp": 0.01095654, + "balance_loss_clip": 1.04727674, + "balance_loss_mlp": 1.01115835, + "epoch": 0.11988216196717369, + "flos": 16909437623040.0, + "grad_norm": 3.79958017777432, + "language_loss": 0.91966993, + "learning_rate": 3.915945923428559e-06, + "loss": 0.94231337, + "num_input_tokens_seen": 21035630, + "step": 997, + "time_per_iteration": 2.7221438884735107 + }, + { + "auxiliary_loss_clip": 0.01193324, + "auxiliary_loss_mlp": 0.01093195, + "balance_loss_clip": 1.05117369, + "balance_loss_mlp": 1.00927138, + "epoch": 0.12000240485781279, + "flos": 16216577205120.0, + "grad_norm": 2.0652453164733666, + "language_loss": 0.82959557, + "learning_rate": 3.915722323464577e-06, + "loss": 0.85246068, + "num_input_tokens_seen": 21054235, + "step": 998, + "time_per_iteration": 2.688628673553467 + }, + { + "auxiliary_loss_clip": 0.0119127, + "auxiliary_loss_mlp": 0.01093448, + "balance_loss_clip": 1.05157685, + "balance_loss_mlp": 1.00933433, + "epoch": 0.12012264774845187, + "flos": 49344887525760.0, + "grad_norm": 3.681746378477176, + "language_loss": 0.701657, + "learning_rate": 3.91549843288704e-06, + "loss": 0.72450423, + "num_input_tokens_seen": 21077915, + "step": 999, + "time_per_iteration": 3.0313639640808105 + }, + { + "auxiliary_loss_clip": 0.01174713, + "auxiliary_loss_mlp": 0.00875167, + "balance_loss_clip": 1.04930627, + "balance_loss_mlp": 1.00017679, + "epoch": 0.12024289063909097, + "flos": 26979435601920.0, + "grad_norm": 2.058639355287257, + "language_loss": 0.78943729, + "learning_rate": 3.915274251729916e-06, + "loss": 0.80993605, + "num_input_tokens_seen": 21099205, + "step": 1000, + "time_per_iteration": 2.849956750869751 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.01096632, + "balance_loss_clip": 1.04235792, + "balance_loss_mlp": 1.0126605, + "epoch": 0.12036313352973005, + "flos": 19537308633600.0, + "grad_norm": 1.8897309411343581, + "language_loss": 0.90413928, + "learning_rate": 3.91504978002721e-06, + "loss": 0.92674077, + "num_input_tokens_seen": 21118260, + "step": 1001, + "time_per_iteration": 2.802706241607666 + }, + { + "auxiliary_loss_clip": 0.01187213, + "auxiliary_loss_mlp": 0.00875126, + "balance_loss_clip": 1.05378044, + "balance_loss_mlp": 1.00014722, + "epoch": 0.12048337642036915, + "flos": 17268256535040.0, + "grad_norm": 1.9488249466197367, + "language_loss": 0.76041234, + "learning_rate": 3.914825017812974e-06, + "loss": 0.78103572, + "num_input_tokens_seen": 21134910, + "step": 1002, + "time_per_iteration": 2.7478318214416504 + }, + { + "auxiliary_loss_clip": 0.01184437, + "auxiliary_loss_mlp": 0.01097077, + "balance_loss_clip": 1.05329847, + "balance_loss_mlp": 1.01310611, + "epoch": 0.12060361931100824, + "flos": 22856962654080.0, + "grad_norm": 2.212655498748911, + "language_loss": 0.72541636, + "learning_rate": 3.9145999651213065e-06, + "loss": 0.74823153, + "num_input_tokens_seen": 21154150, + "step": 1003, + "time_per_iteration": 2.8134515285491943 + }, + { + "auxiliary_loss_clip": 0.01191559, + "auxiliary_loss_mlp": 0.01094133, + "balance_loss_clip": 1.05172241, + "balance_loss_mlp": 1.01011467, + "epoch": 0.12072386220164733, + "flos": 16726795943040.0, + "grad_norm": 3.5220323938741744, + "language_loss": 0.88613248, + "learning_rate": 3.9143746219863465e-06, + "loss": 0.90898937, + "num_input_tokens_seen": 21171255, + "step": 1004, + "time_per_iteration": 2.7387733459472656 + }, + { + "auxiliary_loss_clip": 0.01196888, + "auxiliary_loss_mlp": 0.01081311, + "balance_loss_clip": 1.0754571, + "balance_loss_mlp": 1.00082135, + "epoch": 0.12084410509228642, + "flos": 55144176105600.0, + "grad_norm": 0.9426887533437953, + "language_loss": 0.64797449, + "learning_rate": 3.914148988442278e-06, + "loss": 0.67075646, + "num_input_tokens_seen": 21227045, + "step": 1005, + "time_per_iteration": 3.327705144882202 + }, + { + "auxiliary_loss_clip": 0.01170777, + "auxiliary_loss_mlp": 0.01093181, + "balance_loss_clip": 1.05000603, + "balance_loss_mlp": 1.0090673, + "epoch": 0.1209643479829255, + "flos": 26760236855040.0, + "grad_norm": 2.3620431781687943, + "language_loss": 0.95011652, + "learning_rate": 3.91392306452333e-06, + "loss": 0.97275615, + "num_input_tokens_seen": 21244120, + "step": 1006, + "time_per_iteration": 2.8850386142730713 + }, + { + "auxiliary_loss_clip": 0.0120379, + "auxiliary_loss_mlp": 0.01094302, + "balance_loss_clip": 1.05421495, + "balance_loss_mlp": 1.00990224, + "epoch": 0.1210845908735646, + "flos": 11035026725760.0, + "grad_norm": 2.9705439220977605, + "language_loss": 0.66327721, + "learning_rate": 3.913696850263774e-06, + "loss": 0.68625814, + "num_input_tokens_seen": 21258485, + "step": 1007, + "time_per_iteration": 4.57569694519043 + }, + { + "auxiliary_loss_clip": 0.01195118, + "auxiliary_loss_mlp": 0.01095489, + "balance_loss_clip": 1.05313516, + "balance_loss_mlp": 1.01166141, + "epoch": 0.1212048337642037, + "flos": 20484631975680.0, + "grad_norm": 2.0801060765422985, + "language_loss": 0.79036105, + "learning_rate": 3.913470345697929e-06, + "loss": 0.81326711, + "num_input_tokens_seen": 21277115, + "step": 1008, + "time_per_iteration": 2.6972639560699463 + }, + { + "auxiliary_loss_clip": 0.01156779, + "auxiliary_loss_mlp": 0.01095954, + "balance_loss_clip": 1.04472327, + "balance_loss_mlp": 1.01207805, + "epoch": 0.12132507665484278, + "flos": 22346061557760.0, + "grad_norm": 2.123393798013024, + "language_loss": 0.85301399, + "learning_rate": 3.913243550860153e-06, + "loss": 0.87554139, + "num_input_tokens_seen": 21294880, + "step": 1009, + "time_per_iteration": 3.8367457389831543 + }, + { + "auxiliary_loss_clip": 0.01194538, + "auxiliary_loss_mlp": 0.01094457, + "balance_loss_clip": 1.0552218, + "balance_loss_mlp": 1.0106287, + "epoch": 0.12144531954548188, + "flos": 29314957818240.0, + "grad_norm": 1.8827677861260457, + "language_loss": 0.7628845, + "learning_rate": 3.913016465784852e-06, + "loss": 0.78577441, + "num_input_tokens_seen": 21315555, + "step": 1010, + "time_per_iteration": 2.7366116046905518 + }, + { + "auxiliary_loss_clip": 0.01150553, + "auxiliary_loss_mlp": 0.01093887, + "balance_loss_clip": 1.04952121, + "balance_loss_mlp": 1.00967717, + "epoch": 0.12156556243612096, + "flos": 20485242506880.0, + "grad_norm": 2.3688042175111894, + "language_loss": 0.71792734, + "learning_rate": 3.912789090506474e-06, + "loss": 0.74037176, + "num_input_tokens_seen": 21334815, + "step": 1011, + "time_per_iteration": 2.820451498031616 + }, + { + "auxiliary_loss_clip": 0.01172135, + "auxiliary_loss_mlp": 0.01094248, + "balance_loss_clip": 1.04889345, + "balance_loss_mlp": 1.01013434, + "epoch": 0.12168580532676006, + "flos": 16472009796480.0, + "grad_norm": 2.231137299088491, + "language_loss": 0.71769881, + "learning_rate": 3.9125614250595114e-06, + "loss": 0.7403627, + "num_input_tokens_seen": 21351025, + "step": 1012, + "time_per_iteration": 3.7157142162323 + }, + { + "auxiliary_loss_clip": 0.01193943, + "auxiliary_loss_mlp": 0.01094089, + "balance_loss_clip": 1.05400825, + "balance_loss_mlp": 1.01006985, + "epoch": 0.12180604821739914, + "flos": 15341290588800.0, + "grad_norm": 2.636270026528687, + "language_loss": 0.88883555, + "learning_rate": 3.912333469478502e-06, + "loss": 0.91171587, + "num_input_tokens_seen": 21368990, + "step": 1013, + "time_per_iteration": 2.7087841033935547 + }, + { + "auxiliary_loss_clip": 0.01186319, + "auxiliary_loss_mlp": 0.0109479, + "balance_loss_clip": 1.05356574, + "balance_loss_mlp": 1.01067615, + "epoch": 0.12192629110803824, + "flos": 19318038059520.0, + "grad_norm": 4.312670593752054, + "language_loss": 0.77501416, + "learning_rate": 3.912105223798025e-06, + "loss": 0.79782522, + "num_input_tokens_seen": 21388410, + "step": 1014, + "time_per_iteration": 2.7797036170959473 + }, + { + "auxiliary_loss_clip": 0.01180468, + "auxiliary_loss_mlp": 0.01081938, + "balance_loss_clip": 1.07544875, + "balance_loss_mlp": 1.00144792, + "epoch": 0.12204653399867733, + "flos": 47725354085760.0, + "grad_norm": 0.9919675201810211, + "language_loss": 0.6759789, + "learning_rate": 3.9118766880527065e-06, + "loss": 0.69860291, + "num_input_tokens_seen": 21442845, + "step": 1015, + "time_per_iteration": 3.2118303775787354 + }, + { + "auxiliary_loss_clip": 0.01156185, + "auxiliary_loss_mlp": 0.0109379, + "balance_loss_clip": 1.04430842, + "balance_loss_mlp": 1.0099622, + "epoch": 0.12216677688931642, + "flos": 18221936584320.0, + "grad_norm": 1.6786101276609673, + "language_loss": 0.73686719, + "learning_rate": 3.9116478622772145e-06, + "loss": 0.75936699, + "num_input_tokens_seen": 21461420, + "step": 1016, + "time_per_iteration": 2.834740400314331 + }, + { + "auxiliary_loss_clip": 0.01189999, + "auxiliary_loss_mlp": 0.01094068, + "balance_loss_clip": 1.0515672, + "balance_loss_mlp": 1.01014435, + "epoch": 0.12228701977995551, + "flos": 27525636789120.0, + "grad_norm": 1.8493548903905255, + "language_loss": 0.88321543, + "learning_rate": 3.911418746506261e-06, + "loss": 0.90605617, + "num_input_tokens_seen": 21481550, + "step": 1017, + "time_per_iteration": 2.8372092247009277 + }, + { + "auxiliary_loss_clip": 0.01192336, + "auxiliary_loss_mlp": 0.01095461, + "balance_loss_clip": 1.05333018, + "balance_loss_mlp": 1.01158512, + "epoch": 0.1224072626705946, + "flos": 21798136517760.0, + "grad_norm": 1.9497387639697634, + "language_loss": 0.78751558, + "learning_rate": 3.911189340774604e-06, + "loss": 0.81039357, + "num_input_tokens_seen": 21501680, + "step": 1018, + "time_per_iteration": 2.7233810424804688 + }, + { + "auxiliary_loss_clip": 0.01186557, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_clip": 1.05374098, + "balance_loss_mlp": 1.00876892, + "epoch": 0.1225275055612337, + "flos": 20703758895360.0, + "grad_norm": 1.769720136778473, + "language_loss": 0.79842472, + "learning_rate": 3.910959645117043e-06, + "loss": 0.8212148, + "num_input_tokens_seen": 21521015, + "step": 1019, + "time_per_iteration": 2.8279290199279785 + }, + { + "auxiliary_loss_clip": 0.01192308, + "auxiliary_loss_mlp": 0.00874117, + "balance_loss_clip": 1.07270169, + "balance_loss_mlp": 0.99982148, + "epoch": 0.12264774845187278, + "flos": 57745294462080.0, + "grad_norm": 0.8210291034265658, + "language_loss": 0.56704533, + "learning_rate": 3.910729659568423e-06, + "loss": 0.58770961, + "num_input_tokens_seen": 21578200, + "step": 1020, + "time_per_iteration": 3.3063957691192627 + }, + { + "auxiliary_loss_clip": 0.01183685, + "auxiliary_loss_mlp": 0.01098024, + "balance_loss_clip": 1.05422997, + "balance_loss_mlp": 1.01438713, + "epoch": 0.12276799134251187, + "flos": 26396282298240.0, + "grad_norm": 1.6580858435151133, + "language_loss": 0.81903684, + "learning_rate": 3.9104993841636344e-06, + "loss": 0.84185392, + "num_input_tokens_seen": 21598770, + "step": 1021, + "time_per_iteration": 2.8192198276519775 + }, + { + "auxiliary_loss_clip": 0.01177603, + "auxiliary_loss_mlp": 0.00875094, + "balance_loss_clip": 1.0502789, + "balance_loss_mlp": 1.00014639, + "epoch": 0.12288823423315097, + "flos": 21064193919360.0, + "grad_norm": 1.759264376611251, + "language_loss": 0.80892909, + "learning_rate": 3.910268818937608e-06, + "loss": 0.82945609, + "num_input_tokens_seen": 21616925, + "step": 1022, + "time_per_iteration": 2.779712438583374 + }, + { + "auxiliary_loss_clip": 0.0116189, + "auxiliary_loss_mlp": 0.01096225, + "balance_loss_clip": 1.05062759, + "balance_loss_mlp": 1.01230192, + "epoch": 0.12300847712379005, + "flos": 12312441077760.0, + "grad_norm": 2.3720025920288923, + "language_loss": 0.87698948, + "learning_rate": 3.9100379639253196e-06, + "loss": 0.89957064, + "num_input_tokens_seen": 21633645, + "step": 1023, + "time_per_iteration": 2.796964168548584 + }, + { + "auxiliary_loss_clip": 0.01176458, + "auxiliary_loss_mlp": 0.01092182, + "balance_loss_clip": 1.05118036, + "balance_loss_mlp": 1.0082587, + "epoch": 0.12312872001442915, + "flos": 16762239688320.0, + "grad_norm": 2.5590379476244776, + "language_loss": 0.86684984, + "learning_rate": 3.909806819161791e-06, + "loss": 0.88953626, + "num_input_tokens_seen": 21649120, + "step": 1024, + "time_per_iteration": 2.7511656284332275 + }, + { + "auxiliary_loss_clip": 0.01164375, + "auxiliary_loss_mlp": 0.01094379, + "balance_loss_clip": 1.04265785, + "balance_loss_mlp": 1.01026535, + "epoch": 0.12324896290506823, + "flos": 18404937400320.0, + "grad_norm": 4.3307734550279395, + "language_loss": 0.86389899, + "learning_rate": 3.909575384682086e-06, + "loss": 0.88648653, + "num_input_tokens_seen": 21668000, + "step": 1025, + "time_per_iteration": 2.765326738357544 + }, + { + "auxiliary_loss_clip": 0.01194437, + "auxiliary_loss_mlp": 0.01093884, + "balance_loss_clip": 1.05403256, + "balance_loss_mlp": 1.01000786, + "epoch": 0.12336920579570733, + "flos": 18915407533440.0, + "grad_norm": 2.137143057176349, + "language_loss": 0.6942451, + "learning_rate": 3.9093436605213144e-06, + "loss": 0.71712834, + "num_input_tokens_seen": 21688500, + "step": 1026, + "time_per_iteration": 2.7642955780029297 + }, + { + "auxiliary_loss_clip": 0.01179549, + "auxiliary_loss_mlp": 0.01092838, + "balance_loss_clip": 1.05019379, + "balance_loss_mlp": 1.00896215, + "epoch": 0.12348944868634643, + "flos": 23878369797120.0, + "grad_norm": 1.79498988130486, + "language_loss": 0.7929424, + "learning_rate": 3.909111646714627e-06, + "loss": 0.81566626, + "num_input_tokens_seen": 21709345, + "step": 1027, + "time_per_iteration": 2.787384510040283 + }, + { + "auxiliary_loss_clip": 0.01201433, + "auxiliary_loss_mlp": 0.01092773, + "balance_loss_clip": 1.05350733, + "balance_loss_mlp": 1.00923121, + "epoch": 0.12360969157698551, + "flos": 19026084314880.0, + "grad_norm": 2.0001924771829636, + "language_loss": 0.72391915, + "learning_rate": 3.9088793432972206e-06, + "loss": 0.74686122, + "num_input_tokens_seen": 21728165, + "step": 1028, + "time_per_iteration": 2.7947661876678467 + }, + { + "auxiliary_loss_clip": 0.01156815, + "auxiliary_loss_mlp": 0.01094097, + "balance_loss_clip": 1.04602385, + "balance_loss_mlp": 1.01055551, + "epoch": 0.1237299344676246, + "flos": 13224607983360.0, + "grad_norm": 2.074748980299056, + "language_loss": 0.82149059, + "learning_rate": 3.908646750304336e-06, + "loss": 0.84399974, + "num_input_tokens_seen": 21745850, + "step": 1029, + "time_per_iteration": 2.773204803466797 + }, + { + "auxiliary_loss_clip": 0.01176688, + "auxiliary_loss_mlp": 0.01094743, + "balance_loss_clip": 1.04749227, + "balance_loss_mlp": 1.01105833, + "epoch": 0.12385017735826369, + "flos": 20485673470080.0, + "grad_norm": 1.5652187292786641, + "language_loss": 0.87368649, + "learning_rate": 3.908413867771257e-06, + "loss": 0.89640081, + "num_input_tokens_seen": 21764760, + "step": 1030, + "time_per_iteration": 2.7681918144226074 + }, + { + "auxiliary_loss_clip": 0.0118902, + "auxiliary_loss_mlp": 0.01094861, + "balance_loss_clip": 1.05076337, + "balance_loss_mlp": 1.01098573, + "epoch": 0.12397042024890279, + "flos": 17347835116800.0, + "grad_norm": 1.6935401983045735, + "language_loss": 0.80937207, + "learning_rate": 3.908180695733311e-06, + "loss": 0.8322109, + "num_input_tokens_seen": 21784250, + "step": 1031, + "time_per_iteration": 2.729254722595215 + }, + { + "auxiliary_loss_clip": 0.01150174, + "auxiliary_loss_mlp": 0.0109264, + "balance_loss_clip": 1.04560018, + "balance_loss_mlp": 1.00905013, + "epoch": 0.12409066313954187, + "flos": 20412343854720.0, + "grad_norm": 1.9075693006373793, + "language_loss": 0.82783133, + "learning_rate": 3.907947234225871e-06, + "loss": 0.85025948, + "num_input_tokens_seen": 21803260, + "step": 1032, + "time_per_iteration": 2.8703055381774902 + }, + { + "auxiliary_loss_clip": 0.01146098, + "auxiliary_loss_mlp": 0.01094077, + "balance_loss_clip": 1.04521751, + "balance_loss_mlp": 1.01067829, + "epoch": 0.12421090603018096, + "flos": 20736688688640.0, + "grad_norm": 1.8714992848210947, + "language_loss": 0.87475002, + "learning_rate": 3.907713483284352e-06, + "loss": 0.89715171, + "num_input_tokens_seen": 21822735, + "step": 1033, + "time_per_iteration": 3.860045909881592 + }, + { + "auxiliary_loss_clip": 0.01133413, + "auxiliary_loss_mlp": 0.01094719, + "balance_loss_clip": 1.04120374, + "balance_loss_mlp": 1.0109868, + "epoch": 0.12433114892082006, + "flos": 24498834353280.0, + "grad_norm": 2.6257083686073996, + "language_loss": 0.97383898, + "learning_rate": 3.907479442944216e-06, + "loss": 0.99612033, + "num_input_tokens_seen": 21841140, + "step": 1034, + "time_per_iteration": 2.9533634185791016 + }, + { + "auxiliary_loss_clip": 0.01192331, + "auxiliary_loss_mlp": 0.01092144, + "balance_loss_clip": 1.05422783, + "balance_loss_mlp": 1.00836349, + "epoch": 0.12445139181145914, + "flos": 19682315838720.0, + "grad_norm": 2.028824374043919, + "language_loss": 0.92615724, + "learning_rate": 3.907245113240963e-06, + "loss": 0.94900203, + "num_input_tokens_seen": 21859260, + "step": 1035, + "time_per_iteration": 3.6766085624694824 + }, + { + "auxiliary_loss_clip": 0.01172753, + "auxiliary_loss_mlp": 0.01091518, + "balance_loss_clip": 1.04981995, + "balance_loss_mlp": 1.0077374, + "epoch": 0.12457163470209824, + "flos": 46423087522560.0, + "grad_norm": 1.8299283260086248, + "language_loss": 0.73713678, + "learning_rate": 3.907010494210144e-06, + "loss": 0.75977951, + "num_input_tokens_seen": 21881920, + "step": 1036, + "time_per_iteration": 3.0238184928894043 + }, + { + "auxiliary_loss_clip": 0.01191858, + "auxiliary_loss_mlp": 0.01093786, + "balance_loss_clip": 1.05268121, + "balance_loss_mlp": 1.01000524, + "epoch": 0.12469187759273732, + "flos": 20376289578240.0, + "grad_norm": 2.0824878891943777, + "language_loss": 0.91855228, + "learning_rate": 3.9067755858873495e-06, + "loss": 0.94140875, + "num_input_tokens_seen": 21898720, + "step": 1037, + "time_per_iteration": 3.6132373809814453 + }, + { + "auxiliary_loss_clip": 0.01148204, + "auxiliary_loss_mlp": 0.01080952, + "balance_loss_clip": 1.05188799, + "balance_loss_mlp": 1.00046206, + "epoch": 0.12481212048337642, + "flos": 69224641447680.0, + "grad_norm": 0.8685269025038239, + "language_loss": 0.62828398, + "learning_rate": 3.906540388308214e-06, + "loss": 0.65057552, + "num_input_tokens_seen": 21958305, + "step": 1038, + "time_per_iteration": 3.3258605003356934 + }, + { + "auxiliary_loss_clip": 0.01152611, + "auxiliary_loss_mlp": 0.01094598, + "balance_loss_clip": 1.04943109, + "balance_loss_mlp": 1.01086593, + "epoch": 0.12493236337401552, + "flos": 18223696350720.0, + "grad_norm": 7.758324537562088, + "language_loss": 0.81365681, + "learning_rate": 3.906304901508417e-06, + "loss": 0.83612883, + "num_input_tokens_seen": 21977205, + "step": 1039, + "time_per_iteration": 2.8394522666931152 + }, + { + "auxiliary_loss_clip": 0.01193705, + "auxiliary_loss_mlp": 0.01092766, + "balance_loss_clip": 1.05444777, + "balance_loss_mlp": 1.00927234, + "epoch": 0.12505260626465461, + "flos": 30044375303040.0, + "grad_norm": 2.0218680524646246, + "language_loss": 0.75527442, + "learning_rate": 3.9060691255236835e-06, + "loss": 0.77813911, + "num_input_tokens_seen": 21997770, + "step": 1040, + "time_per_iteration": 2.7506234645843506 + }, + { + "auxiliary_loss_clip": 0.01183706, + "auxiliary_loss_mlp": 0.01093028, + "balance_loss_clip": 1.05160546, + "balance_loss_mlp": 1.00939071, + "epoch": 0.1251728491552937, + "flos": 24433980347520.0, + "grad_norm": 1.6203899484205706, + "language_loss": 0.80435222, + "learning_rate": 3.905833060389778e-06, + "loss": 0.82711953, + "num_input_tokens_seen": 22021890, + "step": 1041, + "time_per_iteration": 2.8443241119384766 + }, + { + "auxiliary_loss_clip": 0.01202739, + "auxiliary_loss_mlp": 0.00874956, + "balance_loss_clip": 1.0546844, + "balance_loss_mlp": 1.00003111, + "epoch": 0.12529309204593278, + "flos": 27119809952640.0, + "grad_norm": 2.879569240455281, + "language_loss": 0.78281343, + "learning_rate": 3.905596706142513e-06, + "loss": 0.80359042, + "num_input_tokens_seen": 22043300, + "step": 1042, + "time_per_iteration": 2.7167489528656006 + }, + { + "auxiliary_loss_clip": 0.01171059, + "auxiliary_loss_mlp": 0.01093987, + "balance_loss_clip": 1.04941344, + "balance_loss_mlp": 1.01044512, + "epoch": 0.12541333493657186, + "flos": 30774151923840.0, + "grad_norm": 4.27650214572293, + "language_loss": 0.86252558, + "learning_rate": 3.9053600628177435e-06, + "loss": 0.88517606, + "num_input_tokens_seen": 22062910, + "step": 1043, + "time_per_iteration": 2.832458019256592 + }, + { + "auxiliary_loss_clip": 0.01200156, + "auxiliary_loss_mlp": 0.01091249, + "balance_loss_clip": 1.05223584, + "balance_loss_mlp": 1.00765991, + "epoch": 0.12553357782721097, + "flos": 23659566099840.0, + "grad_norm": 2.178070838530362, + "language_loss": 0.84592497, + "learning_rate": 3.905123130451367e-06, + "loss": 0.86883903, + "num_input_tokens_seen": 22084010, + "step": 1044, + "time_per_iteration": 2.681159734725952 + }, + { + "auxiliary_loss_clip": 0.01201638, + "auxiliary_loss_mlp": 0.01091412, + "balance_loss_clip": 1.05395985, + "balance_loss_mlp": 1.00782275, + "epoch": 0.12565382071785006, + "flos": 24863758577280.0, + "grad_norm": 2.1724560580250887, + "language_loss": 0.79326272, + "learning_rate": 3.904885909079326e-06, + "loss": 0.81619322, + "num_input_tokens_seen": 22102795, + "step": 1045, + "time_per_iteration": 2.709096670150757 + }, + { + "auxiliary_loss_clip": 0.01187524, + "auxiliary_loss_mlp": 0.01091639, + "balance_loss_clip": 1.04982948, + "balance_loss_mlp": 1.00785875, + "epoch": 0.12577406360848914, + "flos": 21360780518400.0, + "grad_norm": 2.564962226514369, + "language_loss": 0.77684003, + "learning_rate": 3.904648398737607e-06, + "loss": 0.79963171, + "num_input_tokens_seen": 22121360, + "step": 1046, + "time_per_iteration": 2.7732810974121094 + }, + { + "auxiliary_loss_clip": 0.01200906, + "auxiliary_loss_mlp": 0.0109341, + "balance_loss_clip": 1.05307984, + "balance_loss_mlp": 1.01015437, + "epoch": 0.12589430649912825, + "flos": 36138056774400.0, + "grad_norm": 1.8210050334950454, + "language_loss": 0.77892786, + "learning_rate": 3.9044105994622406e-06, + "loss": 0.801871, + "num_input_tokens_seen": 22142505, + "step": 1047, + "time_per_iteration": 2.7627642154693604 + }, + { + "auxiliary_loss_clip": 0.01181358, + "auxiliary_loss_mlp": 0.00874979, + "balance_loss_clip": 1.05086327, + "balance_loss_mlp": 1.00001752, + "epoch": 0.12601454938976733, + "flos": 25337671643520.0, + "grad_norm": 1.7827312626063911, + "language_loss": 0.81650925, + "learning_rate": 3.9041725112893005e-06, + "loss": 0.83707273, + "num_input_tokens_seen": 22163730, + "step": 1048, + "time_per_iteration": 2.8340063095092773 + }, + { + "auxiliary_loss_clip": 0.01160766, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_clip": 1.04875946, + "balance_loss_mlp": 1.00719309, + "epoch": 0.12613479228040642, + "flos": 15560094286080.0, + "grad_norm": 3.759627781704481, + "language_loss": 0.75600004, + "learning_rate": 3.903934134254904e-06, + "loss": 0.77851641, + "num_input_tokens_seen": 22181520, + "step": 1049, + "time_per_iteration": 2.719611644744873 + }, + { + "auxiliary_loss_clip": 0.01192519, + "auxiliary_loss_mlp": 0.01093215, + "balance_loss_clip": 1.05296028, + "balance_loss_mlp": 1.0094347, + "epoch": 0.1262550351710455, + "flos": 21470595373440.0, + "grad_norm": 2.2526222580824893, + "language_loss": 0.84648377, + "learning_rate": 3.903695468395213e-06, + "loss": 0.86934114, + "num_input_tokens_seen": 22199390, + "step": 1050, + "time_per_iteration": 2.7323379516601562 + }, + { + "auxiliary_loss_clip": 0.01183614, + "auxiliary_loss_mlp": 0.01093075, + "balance_loss_clip": 1.05131388, + "balance_loss_mlp": 1.00915134, + "epoch": 0.1263752780616846, + "flos": 31576719456000.0, + "grad_norm": 3.6922642797386716, + "language_loss": 0.55438119, + "learning_rate": 3.903456513746434e-06, + "loss": 0.57714808, + "num_input_tokens_seen": 22220365, + "step": 1051, + "time_per_iteration": 2.78397536277771 + }, + { + "auxiliary_loss_clip": 0.0120172, + "auxiliary_loss_mlp": 0.01094203, + "balance_loss_clip": 1.05403125, + "balance_loss_mlp": 1.01066113, + "epoch": 0.1264955209523237, + "flos": 28768217927040.0, + "grad_norm": 1.7546864974221612, + "language_loss": 0.87339783, + "learning_rate": 3.903217270344815e-06, + "loss": 0.89635706, + "num_input_tokens_seen": 22240615, + "step": 1052, + "time_per_iteration": 2.7023727893829346 + }, + { + "auxiliary_loss_clip": 0.01162494, + "auxiliary_loss_mlp": 0.01095149, + "balance_loss_clip": 1.04899085, + "balance_loss_mlp": 1.01151168, + "epoch": 0.12661576384296278, + "flos": 29241125412480.0, + "grad_norm": 4.006678335187508, + "language_loss": 0.82691693, + "learning_rate": 3.902977738226648e-06, + "loss": 0.84949338, + "num_input_tokens_seen": 22261350, + "step": 1053, + "time_per_iteration": 2.8490216732025146 + }, + { + "auxiliary_loss_clip": 0.01189971, + "auxiliary_loss_mlp": 0.01096585, + "balance_loss_clip": 1.05137336, + "balance_loss_mlp": 1.01251876, + "epoch": 0.12673600673360189, + "flos": 20850346298880.0, + "grad_norm": 3.1539770501381525, + "language_loss": 0.90681785, + "learning_rate": 3.902737917428273e-06, + "loss": 0.92968339, + "num_input_tokens_seen": 22279515, + "step": 1054, + "time_per_iteration": 2.696162462234497 + }, + { + "auxiliary_loss_clip": 0.01199723, + "auxiliary_loss_mlp": 0.01092118, + "balance_loss_clip": 1.05189955, + "balance_loss_mlp": 1.00843322, + "epoch": 0.12685624962424097, + "flos": 25263695583360.0, + "grad_norm": 1.5611303072656166, + "language_loss": 0.8389529, + "learning_rate": 3.902497807986068e-06, + "loss": 0.8618713, + "num_input_tokens_seen": 22299535, + "step": 1055, + "time_per_iteration": 2.714580535888672 + }, + { + "auxiliary_loss_clip": 0.01170213, + "auxiliary_loss_mlp": 0.01092409, + "balance_loss_clip": 1.0483067, + "balance_loss_mlp": 1.00834262, + "epoch": 0.12697649251488005, + "flos": 27527109246720.0, + "grad_norm": 14.75117790668742, + "language_loss": 0.83903337, + "learning_rate": 3.902257409936458e-06, + "loss": 0.86165959, + "num_input_tokens_seen": 22320300, + "step": 1056, + "time_per_iteration": 2.81815767288208 + }, + { + "auxiliary_loss_clip": 0.01176473, + "auxiliary_loss_mlp": 0.01095624, + "balance_loss_clip": 1.04763842, + "balance_loss_mlp": 1.01222563, + "epoch": 0.12709673540551916, + "flos": 21251863503360.0, + "grad_norm": 1.9504637038918569, + "language_loss": 0.84175986, + "learning_rate": 3.902016723315912e-06, + "loss": 0.86448085, + "num_input_tokens_seen": 22338240, + "step": 1057, + "time_per_iteration": 2.741551637649536 + }, + { + "auxiliary_loss_clip": 0.01184432, + "auxiliary_loss_mlp": 0.01092241, + "balance_loss_clip": 1.05138516, + "balance_loss_mlp": 1.0086993, + "epoch": 0.12721697829615825, + "flos": 25337707557120.0, + "grad_norm": 2.1738104291626996, + "language_loss": 0.69274086, + "learning_rate": 3.901775748160941e-06, + "loss": 0.71550763, + "num_input_tokens_seen": 22357420, + "step": 1058, + "time_per_iteration": 3.674431324005127 + }, + { + "auxiliary_loss_clip": 0.01179217, + "auxiliary_loss_mlp": 0.01081825, + "balance_loss_clip": 1.06978655, + "balance_loss_mlp": 1.00133502, + "epoch": 0.12733722118679733, + "flos": 61943287754880.0, + "grad_norm": 0.790830527555561, + "language_loss": 0.60868162, + "learning_rate": 3.901534484508101e-06, + "loss": 0.63129205, + "num_input_tokens_seen": 22420095, + "step": 1059, + "time_per_iteration": 4.241262912750244 + }, + { + "auxiliary_loss_clip": 0.01181756, + "auxiliary_loss_mlp": 0.01091837, + "balance_loss_clip": 1.05180645, + "balance_loss_mlp": 1.00829506, + "epoch": 0.1274574640774364, + "flos": 26976742081920.0, + "grad_norm": 1.7991061323006632, + "language_loss": 0.74737835, + "learning_rate": 3.901292932393991e-06, + "loss": 0.77011424, + "num_input_tokens_seen": 22438975, + "step": 1060, + "time_per_iteration": 2.791846990585327 + }, + { + "auxiliary_loss_clip": 0.01202506, + "auxiliary_loss_mlp": 0.0109418, + "balance_loss_clip": 1.05429435, + "balance_loss_mlp": 1.01054311, + "epoch": 0.12757770696807552, + "flos": 22236318529920.0, + "grad_norm": 2.7383904485040085, + "language_loss": 0.85448241, + "learning_rate": 3.9010510918552555e-06, + "loss": 0.87744927, + "num_input_tokens_seen": 22458050, + "step": 1061, + "time_per_iteration": 3.549980878829956 + }, + { + "auxiliary_loss_clip": 0.0118368, + "auxiliary_loss_mlp": 0.01096996, + "balance_loss_clip": 1.05151904, + "balance_loss_mlp": 1.01316762, + "epoch": 0.1276979498587146, + "flos": 28547905858560.0, + "grad_norm": 2.9492435112241457, + "language_loss": 0.74679941, + "learning_rate": 3.900808962928581e-06, + "loss": 0.76960617, + "num_input_tokens_seen": 22475665, + "step": 1062, + "time_per_iteration": 3.5068235397338867 + }, + { + "auxiliary_loss_clip": 0.01201303, + "auxiliary_loss_mlp": 0.01092763, + "balance_loss_clip": 1.05329609, + "balance_loss_mlp": 1.00936401, + "epoch": 0.1278181927493537, + "flos": 17420338719360.0, + "grad_norm": 2.0134231506002593, + "language_loss": 0.8912921, + "learning_rate": 3.900566545650698e-06, + "loss": 0.91423273, + "num_input_tokens_seen": 22493335, + "step": 1063, + "time_per_iteration": 2.586836338043213 + }, + { + "auxiliary_loss_clip": 0.01189194, + "auxiliary_loss_mlp": 0.01095924, + "balance_loss_clip": 1.05186772, + "balance_loss_mlp": 1.0120486, + "epoch": 0.1279384356399928, + "flos": 21138636856320.0, + "grad_norm": 2.3194125693671768, + "language_loss": 0.81412047, + "learning_rate": 3.900323840058381e-06, + "loss": 0.83697164, + "num_input_tokens_seen": 22511045, + "step": 1064, + "time_per_iteration": 2.613250255584717 + }, + { + "auxiliary_loss_clip": 0.01193321, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_clip": 1.05326545, + "balance_loss_mlp": 1.01207709, + "epoch": 0.12805867853063188, + "flos": 26576733248640.0, + "grad_norm": 2.1833286173914535, + "language_loss": 0.81703806, + "learning_rate": 3.900080846188449e-06, + "loss": 0.83992839, + "num_input_tokens_seen": 22529635, + "step": 1065, + "time_per_iteration": 2.7265782356262207 + }, + { + "auxiliary_loss_clip": 0.01201944, + "auxiliary_loss_mlp": 0.01093392, + "balance_loss_clip": 1.05374777, + "balance_loss_mlp": 1.00970685, + "epoch": 0.12817892142127096, + "flos": 16436206915200.0, + "grad_norm": 1.9267565168919434, + "language_loss": 0.81551373, + "learning_rate": 3.8998375640777625e-06, + "loss": 0.83846712, + "num_input_tokens_seen": 22547505, + "step": 1066, + "time_per_iteration": 2.5949034690856934 + }, + { + "auxiliary_loss_clip": 0.01162554, + "auxiliary_loss_mlp": 0.01080727, + "balance_loss_clip": 1.05252993, + "balance_loss_mlp": 1.00023711, + "epoch": 0.12829916431191005, + "flos": 60757049099520.0, + "grad_norm": 0.7048869138284423, + "language_loss": 0.5265435, + "learning_rate": 3.899593993763229e-06, + "loss": 0.5489763, + "num_input_tokens_seen": 22608465, + "step": 1067, + "time_per_iteration": 3.2143614292144775 + }, + { + "auxiliary_loss_clip": 0.01170499, + "auxiliary_loss_mlp": 0.01096586, + "balance_loss_clip": 1.05022156, + "balance_loss_mlp": 1.01261449, + "epoch": 0.12841940720254916, + "flos": 29786895636480.0, + "grad_norm": 2.179373367394756, + "language_loss": 0.81146705, + "learning_rate": 3.899350135281796e-06, + "loss": 0.83413786, + "num_input_tokens_seen": 22629465, + "step": 1068, + "time_per_iteration": 2.8342342376708984 + }, + { + "auxiliary_loss_clip": 0.01164464, + "auxiliary_loss_mlp": 0.01094688, + "balance_loss_clip": 1.04373646, + "balance_loss_mlp": 1.01124156, + "epoch": 0.12853965009318824, + "flos": 25951851319680.0, + "grad_norm": 2.0012954062314208, + "language_loss": 0.79557252, + "learning_rate": 3.8991059886704585e-06, + "loss": 0.81816411, + "num_input_tokens_seen": 22648970, + "step": 1069, + "time_per_iteration": 2.7907590866088867 + }, + { + "auxiliary_loss_clip": 0.01163079, + "auxiliary_loss_mlp": 0.01093999, + "balance_loss_clip": 1.04514694, + "balance_loss_mlp": 1.01031399, + "epoch": 0.12865989298382732, + "flos": 30846871008000.0, + "grad_norm": 2.147226707271435, + "language_loss": 0.83252907, + "learning_rate": 3.898861553966252e-06, + "loss": 0.8550998, + "num_input_tokens_seen": 22668620, + "step": 1070, + "time_per_iteration": 2.8260109424591064 + }, + { + "auxiliary_loss_clip": 0.01137453, + "auxiliary_loss_mlp": 0.01092631, + "balance_loss_clip": 1.04316986, + "balance_loss_mlp": 1.00913668, + "epoch": 0.12878013587446643, + "flos": 25885776251520.0, + "grad_norm": 1.7557133326352456, + "language_loss": 0.87766969, + "learning_rate": 3.898616831206257e-06, + "loss": 0.89997053, + "num_input_tokens_seen": 22689045, + "step": 1071, + "time_per_iteration": 2.8818604946136475 + }, + { + "auxiliary_loss_clip": 0.01170096, + "auxiliary_loss_mlp": 0.01092712, + "balance_loss_clip": 1.04845953, + "balance_loss_mlp": 1.00907469, + "epoch": 0.12890037876510552, + "flos": 23333138277120.0, + "grad_norm": 2.5194262113312025, + "language_loss": 0.76826799, + "learning_rate": 3.8983718204276e-06, + "loss": 0.79089606, + "num_input_tokens_seen": 22711265, + "step": 1072, + "time_per_iteration": 2.7897908687591553 + }, + { + "auxiliary_loss_clip": 0.01175303, + "auxiliary_loss_mlp": 0.01095023, + "balance_loss_clip": 1.05144143, + "balance_loss_mlp": 1.01162469, + "epoch": 0.1290206216557446, + "flos": 23587242065280.0, + "grad_norm": 1.7345610163525063, + "language_loss": 0.82303286, + "learning_rate": 3.898126521667446e-06, + "loss": 0.84573609, + "num_input_tokens_seen": 22731420, + "step": 1073, + "time_per_iteration": 2.7080650329589844 + }, + { + "auxiliary_loss_clip": 0.01184161, + "auxiliary_loss_mlp": 0.01093528, + "balance_loss_clip": 1.05127811, + "balance_loss_mlp": 1.01008153, + "epoch": 0.12914086454638368, + "flos": 24170610850560.0, + "grad_norm": 2.1048478978979333, + "language_loss": 0.82936323, + "learning_rate": 3.897880934963007e-06, + "loss": 0.85214019, + "num_input_tokens_seen": 22750970, + "step": 1074, + "time_per_iteration": 2.9226155281066895 + }, + { + "auxiliary_loss_clip": 0.0118081, + "auxiliary_loss_mlp": 0.01092596, + "balance_loss_clip": 1.04923081, + "balance_loss_mlp": 1.00914907, + "epoch": 0.1292611074370228, + "flos": 20267157081600.0, + "grad_norm": 2.1979078693978056, + "language_loss": 0.78402561, + "learning_rate": 3.89763506035154e-06, + "loss": 0.80675966, + "num_input_tokens_seen": 22768820, + "step": 1075, + "time_per_iteration": 2.7650701999664307 + }, + { + "auxiliary_loss_clip": 0.01184218, + "auxiliary_loss_mlp": 0.01093558, + "balance_loss_clip": 1.05244875, + "balance_loss_mlp": 1.0099206, + "epoch": 0.12938135032766188, + "flos": 27377684668800.0, + "grad_norm": 1.6851960682548763, + "language_loss": 0.81137192, + "learning_rate": 3.897388897870343e-06, + "loss": 0.83414972, + "num_input_tokens_seen": 22789460, + "step": 1076, + "time_per_iteration": 2.7459919452667236 + }, + { + "auxiliary_loss_clip": 0.01182818, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_clip": 1.05099475, + "balance_loss_mlp": 1.00725245, + "epoch": 0.12950159321830096, + "flos": 29277107861760.0, + "grad_norm": 2.15877382487411, + "language_loss": 0.75167, + "learning_rate": 3.89714244755676e-06, + "loss": 0.77440846, + "num_input_tokens_seen": 22810820, + "step": 1077, + "time_per_iteration": 2.8124635219573975 + }, + { + "auxiliary_loss_clip": 0.01153409, + "auxiliary_loss_mlp": 0.01092789, + "balance_loss_clip": 1.04727721, + "balance_loss_mlp": 1.00900912, + "epoch": 0.12962183610894007, + "flos": 24534888629760.0, + "grad_norm": 2.680962951387634, + "language_loss": 0.86155105, + "learning_rate": 3.896895709448175e-06, + "loss": 0.88401306, + "num_input_tokens_seen": 22830570, + "step": 1078, + "time_per_iteration": 2.8318564891815186 + }, + { + "auxiliary_loss_clip": 0.01127787, + "auxiliary_loss_mlp": 0.01090637, + "balance_loss_clip": 1.03637576, + "balance_loss_mlp": 1.00723791, + "epoch": 0.12974207899957915, + "flos": 11215944552960.0, + "grad_norm": 2.634418697159902, + "language_loss": 0.77206397, + "learning_rate": 3.896648683582019e-06, + "loss": 0.79424822, + "num_input_tokens_seen": 22845905, + "step": 1079, + "time_per_iteration": 2.828528642654419 + }, + { + "auxiliary_loss_clip": 0.01155067, + "auxiliary_loss_mlp": 0.01094393, + "balance_loss_clip": 1.04517984, + "balance_loss_mlp": 1.01099455, + "epoch": 0.12986232189021824, + "flos": 24717889445760.0, + "grad_norm": 1.899420267757882, + "language_loss": 0.80525464, + "learning_rate": 3.896401369995766e-06, + "loss": 0.82774925, + "num_input_tokens_seen": 22865710, + "step": 1080, + "time_per_iteration": 2.8247342109680176 + }, + { + "auxiliary_loss_clip": 0.01201685, + "auxiliary_loss_mlp": 0.01093655, + "balance_loss_clip": 1.05424833, + "balance_loss_mlp": 1.01025581, + "epoch": 0.12998256478085732, + "flos": 23915357827200.0, + "grad_norm": 1.783944449833319, + "language_loss": 0.79361248, + "learning_rate": 3.896153768726932e-06, + "loss": 0.81656581, + "num_input_tokens_seen": 22886020, + "step": 1081, + "time_per_iteration": 2.741752862930298 + }, + { + "auxiliary_loss_clip": 0.01190019, + "auxiliary_loss_mlp": 0.01092485, + "balance_loss_clip": 1.05294132, + "balance_loss_mlp": 1.00889516, + "epoch": 0.13010280767149643, + "flos": 18624207974400.0, + "grad_norm": 2.3623172708198124, + "language_loss": 0.87900907, + "learning_rate": 3.8959058798130806e-06, + "loss": 0.90183413, + "num_input_tokens_seen": 22903995, + "step": 1082, + "time_per_iteration": 2.6543309688568115 + }, + { + "auxiliary_loss_clip": 0.011832, + "auxiliary_loss_mlp": 0.00874991, + "balance_loss_clip": 1.05160797, + "balance_loss_mlp": 1.00002813, + "epoch": 0.1302230505621355, + "flos": 22783992174720.0, + "grad_norm": 1.793417047269826, + "language_loss": 0.75195587, + "learning_rate": 3.895657703291814e-06, + "loss": 0.77253777, + "num_input_tokens_seen": 22924100, + "step": 1083, + "time_per_iteration": 2.776175022125244 + }, + { + "auxiliary_loss_clip": 0.01177538, + "auxiliary_loss_mlp": 0.01093361, + "balance_loss_clip": 1.04704034, + "balance_loss_mlp": 1.00958121, + "epoch": 0.1303432934527746, + "flos": 21323612920320.0, + "grad_norm": 3.413422098638391, + "language_loss": 0.8003729, + "learning_rate": 3.895409239200781e-06, + "loss": 0.82308197, + "num_input_tokens_seen": 22939985, + "step": 1084, + "time_per_iteration": 3.7480530738830566 + }, + { + "auxiliary_loss_clip": 0.01191187, + "auxiliary_loss_mlp": 0.01092927, + "balance_loss_clip": 1.05184197, + "balance_loss_mlp": 1.00909877, + "epoch": 0.1304635363434137, + "flos": 20922490765440.0, + "grad_norm": 2.3185011124054444, + "language_loss": 0.9129892, + "learning_rate": 3.895160487577673e-06, + "loss": 0.93583035, + "num_input_tokens_seen": 22957555, + "step": 1085, + "time_per_iteration": 3.7667627334594727 + }, + { + "auxiliary_loss_clip": 0.01196931, + "auxiliary_loss_mlp": 0.01080584, + "balance_loss_clip": 1.06995988, + "balance_loss_mlp": 1.00009346, + "epoch": 0.1305837792340528, + "flos": 63245659080960.0, + "grad_norm": 0.7844249584304449, + "language_loss": 0.60916793, + "learning_rate": 3.894911448460226e-06, + "loss": 0.63194305, + "num_input_tokens_seen": 23016870, + "step": 1086, + "time_per_iteration": 3.1285438537597656 + }, + { + "auxiliary_loss_clip": 0.01129399, + "auxiliary_loss_mlp": 0.01092959, + "balance_loss_clip": 1.04279137, + "balance_loss_mlp": 1.00927448, + "epoch": 0.13070402212469187, + "flos": 26428852955520.0, + "grad_norm": 2.290378654786392, + "language_loss": 0.72682023, + "learning_rate": 3.8946621218862195e-06, + "loss": 0.74904382, + "num_input_tokens_seen": 23037870, + "step": 1087, + "time_per_iteration": 4.025998592376709 + }, + { + "auxiliary_loss_clip": 0.01156693, + "auxiliary_loss_mlp": 0.01090434, + "balance_loss_clip": 1.04503345, + "balance_loss_mlp": 1.00679719, + "epoch": 0.13082426501533098, + "flos": 27673409341440.0, + "grad_norm": 1.832707286864066, + "language_loss": 0.88664794, + "learning_rate": 3.894412507893475e-06, + "loss": 0.90911925, + "num_input_tokens_seen": 23058150, + "step": 1088, + "time_per_iteration": 3.761821985244751 + }, + { + "auxiliary_loss_clip": 0.01160349, + "auxiliary_loss_mlp": 0.01092035, + "balance_loss_clip": 1.04771185, + "balance_loss_mlp": 1.00835037, + "epoch": 0.13094450790597006, + "flos": 24826770547200.0, + "grad_norm": 2.490456308078782, + "language_loss": 0.72015142, + "learning_rate": 3.894162606519859e-06, + "loss": 0.7426753, + "num_input_tokens_seen": 23077100, + "step": 1089, + "time_per_iteration": 2.8599390983581543 + }, + { + "auxiliary_loss_clip": 0.01149424, + "auxiliary_loss_mlp": 0.01090589, + "balance_loss_clip": 1.04420376, + "balance_loss_mlp": 1.0071429, + "epoch": 0.13106475079660915, + "flos": 19062605468160.0, + "grad_norm": 1.9007380105511182, + "language_loss": 0.76950562, + "learning_rate": 3.893912417803282e-06, + "loss": 0.79190576, + "num_input_tokens_seen": 23096815, + "step": 1090, + "time_per_iteration": 2.8396806716918945 + }, + { + "auxiliary_loss_clip": 0.01160993, + "auxiliary_loss_mlp": 0.01092978, + "balance_loss_clip": 1.04686892, + "balance_loss_mlp": 1.00948405, + "epoch": 0.13118499368724823, + "flos": 28913189218560.0, + "grad_norm": 1.9050660971247102, + "language_loss": 0.76985145, + "learning_rate": 3.8936619417816975e-06, + "loss": 0.79239118, + "num_input_tokens_seen": 23117145, + "step": 1091, + "time_per_iteration": 2.874260425567627 + }, + { + "auxiliary_loss_clip": 0.01159889, + "auxiliary_loss_mlp": 0.01093481, + "balance_loss_clip": 1.04636872, + "balance_loss_mlp": 1.00989175, + "epoch": 0.13130523657788734, + "flos": 14283398206080.0, + "grad_norm": 1.9676722632101031, + "language_loss": 0.71545982, + "learning_rate": 3.8934111784931015e-06, + "loss": 0.73799348, + "num_input_tokens_seen": 23134595, + "step": 1092, + "time_per_iteration": 2.8101282119750977 + }, + { + "auxiliary_loss_clip": 0.01188287, + "auxiliary_loss_mlp": 0.0108069, + "balance_loss_clip": 1.06884694, + "balance_loss_mlp": 1.00020027, + "epoch": 0.13142547946852642, + "flos": 70174155519360.0, + "grad_norm": 0.9146898239490636, + "language_loss": 0.5906477, + "learning_rate": 3.893160127975535e-06, + "loss": 0.61333752, + "num_input_tokens_seen": 23195285, + "step": 1093, + "time_per_iteration": 3.426823854446411 + }, + { + "auxiliary_loss_clip": 0.01162452, + "auxiliary_loss_mlp": 0.01094324, + "balance_loss_clip": 1.04870653, + "balance_loss_mlp": 1.01082993, + "epoch": 0.1315457223591655, + "flos": 45805998844800.0, + "grad_norm": 2.3513656797370674, + "language_loss": 0.81376863, + "learning_rate": 3.8929087902670826e-06, + "loss": 0.83633637, + "num_input_tokens_seen": 23216915, + "step": 1094, + "time_per_iteration": 3.0425760746002197 + }, + { + "auxiliary_loss_clip": 0.01195783, + "auxiliary_loss_mlp": 0.01080486, + "balance_loss_clip": 1.06800389, + "balance_loss_mlp": 0.99999619, + "epoch": 0.13166596524980462, + "flos": 62881165820160.0, + "grad_norm": 0.9278154875954349, + "language_loss": 0.60789812, + "learning_rate": 3.8926571654058715e-06, + "loss": 0.63066077, + "num_input_tokens_seen": 23273560, + "step": 1095, + "time_per_iteration": 3.185058355331421 + }, + { + "auxiliary_loss_clip": 0.01163801, + "auxiliary_loss_mlp": 0.01092914, + "balance_loss_clip": 1.04656637, + "balance_loss_mlp": 1.00942004, + "epoch": 0.1317862081404437, + "flos": 23586523793280.0, + "grad_norm": 2.52666709278902, + "language_loss": 0.7701062, + "learning_rate": 3.892405253430074e-06, + "loss": 0.79267335, + "num_input_tokens_seen": 23291080, + "step": 1096, + "time_per_iteration": 2.7733845710754395 + }, + { + "auxiliary_loss_clip": 0.01176372, + "auxiliary_loss_mlp": 0.00874973, + "balance_loss_clip": 1.04812932, + "balance_loss_mlp": 1.00009739, + "epoch": 0.13190645103108278, + "flos": 20260764460800.0, + "grad_norm": 2.211749903648547, + "language_loss": 0.82493877, + "learning_rate": 3.892153054377904e-06, + "loss": 0.84545225, + "num_input_tokens_seen": 23308485, + "step": 1097, + "time_per_iteration": 2.7679531574249268 + }, + { + "auxiliary_loss_clip": 0.01117535, + "auxiliary_loss_mlp": 0.01080756, + "balance_loss_clip": 1.04502249, + "balance_loss_mlp": 1.00064707, + "epoch": 0.13202669392172187, + "flos": 53455440136320.0, + "grad_norm": 0.9384704973011355, + "language_loss": 0.59429461, + "learning_rate": 3.891900568287619e-06, + "loss": 0.61627752, + "num_input_tokens_seen": 23360870, + "step": 1098, + "time_per_iteration": 3.2197048664093018 + }, + { + "auxiliary_loss_clip": 0.01166864, + "auxiliary_loss_mlp": 0.01093804, + "balance_loss_clip": 1.04674101, + "balance_loss_mlp": 1.00997591, + "epoch": 0.13214693681236098, + "flos": 15851293845120.0, + "grad_norm": 2.39617893463633, + "language_loss": 0.72050714, + "learning_rate": 3.891647795197523e-06, + "loss": 0.74311388, + "num_input_tokens_seen": 23376910, + "step": 1099, + "time_per_iteration": 2.743077278137207 + }, + { + "auxiliary_loss_clip": 0.01168299, + "auxiliary_loss_mlp": 0.01096354, + "balance_loss_clip": 1.0474472, + "balance_loss_mlp": 1.01262164, + "epoch": 0.13226717970300006, + "flos": 19353840940800.0, + "grad_norm": 2.33109161362446, + "language_loss": 0.69011569, + "learning_rate": 3.8913947351459605e-06, + "loss": 0.71276224, + "num_input_tokens_seen": 23394450, + "step": 1100, + "time_per_iteration": 2.831092119216919 + }, + { + "auxiliary_loss_clip": 0.01197869, + "auxiliary_loss_mlp": 0.01094011, + "balance_loss_clip": 1.05089295, + "balance_loss_mlp": 1.01061273, + "epoch": 0.13238742259363914, + "flos": 20698084546560.0, + "grad_norm": 2.0398700543856187, + "language_loss": 0.6760577, + "learning_rate": 3.89114138817132e-06, + "loss": 0.69897652, + "num_input_tokens_seen": 23411115, + "step": 1101, + "time_per_iteration": 2.6779797077178955 + }, + { + "auxiliary_loss_clip": 0.01188354, + "auxiliary_loss_mlp": 0.01092704, + "balance_loss_clip": 1.05153918, + "balance_loss_mlp": 1.00959158, + "epoch": 0.13250766548427825, + "flos": 21032449274880.0, + "grad_norm": 1.8171976468841227, + "language_loss": 0.84466517, + "learning_rate": 3.890887754312035e-06, + "loss": 0.86747575, + "num_input_tokens_seen": 23429360, + "step": 1102, + "time_per_iteration": 2.716188430786133 + }, + { + "auxiliary_loss_clip": 0.01162306, + "auxiliary_loss_mlp": 0.01093273, + "balance_loss_clip": 1.04572356, + "balance_loss_mlp": 1.00977921, + "epoch": 0.13262790837491734, + "flos": 22637871648000.0, + "grad_norm": 3.501495139984528, + "language_loss": 0.87538904, + "learning_rate": 3.890633833606581e-06, + "loss": 0.89794481, + "num_input_tokens_seen": 23449050, + "step": 1103, + "time_per_iteration": 2.7862348556518555 + }, + { + "auxiliary_loss_clip": 0.01188955, + "auxiliary_loss_mlp": 0.01091742, + "balance_loss_clip": 1.05286634, + "balance_loss_mlp": 1.00862885, + "epoch": 0.13274815126555642, + "flos": 19683141851520.0, + "grad_norm": 1.7873962944219552, + "language_loss": 0.69372791, + "learning_rate": 3.890379626093477e-06, + "loss": 0.71653485, + "num_input_tokens_seen": 23468800, + "step": 1104, + "time_per_iteration": 2.7173986434936523 + }, + { + "auxiliary_loss_clip": 0.01145575, + "auxiliary_loss_mlp": 0.01091873, + "balance_loss_clip": 1.04001474, + "balance_loss_mlp": 1.00809312, + "epoch": 0.1328683941561955, + "flos": 21317687176320.0, + "grad_norm": 2.0958533676337416, + "language_loss": 0.92858571, + "learning_rate": 3.890125131811287e-06, + "loss": 0.95096028, + "num_input_tokens_seen": 23486850, + "step": 1105, + "time_per_iteration": 2.78253173828125 + }, + { + "auxiliary_loss_clip": 0.0117131, + "auxiliary_loss_mlp": 0.01090484, + "balance_loss_clip": 1.04943466, + "balance_loss_mlp": 1.0072763, + "epoch": 0.1329886370468346, + "flos": 13699131580800.0, + "grad_norm": 2.0538117993465765, + "language_loss": 0.75191438, + "learning_rate": 3.889870350798618e-06, + "loss": 0.77453226, + "num_input_tokens_seen": 23504195, + "step": 1106, + "time_per_iteration": 2.817749500274658 + }, + { + "auxiliary_loss_clip": 0.01197778, + "auxiliary_loss_mlp": 0.01091082, + "balance_loss_clip": 1.05091298, + "balance_loss_mlp": 1.00782657, + "epoch": 0.1331088799374737, + "flos": 21032413361280.0, + "grad_norm": 1.5462455001847886, + "language_loss": 0.78504145, + "learning_rate": 3.889615283094119e-06, + "loss": 0.80792999, + "num_input_tokens_seen": 23523385, + "step": 1107, + "time_per_iteration": 2.722666025161743 + }, + { + "auxiliary_loss_clip": 0.01197488, + "auxiliary_loss_mlp": 0.01093278, + "balance_loss_clip": 1.05083871, + "balance_loss_mlp": 1.00935459, + "epoch": 0.13322912282811278, + "flos": 18260432985600.0, + "grad_norm": 5.075025305323462, + "language_loss": 0.84924924, + "learning_rate": 3.889359928736485e-06, + "loss": 0.87215686, + "num_input_tokens_seen": 23541330, + "step": 1108, + "time_per_iteration": 2.662385940551758 + }, + { + "auxiliary_loss_clip": 0.01162836, + "auxiliary_loss_mlp": 0.00874908, + "balance_loss_clip": 1.04349482, + "balance_loss_mlp": 1.00014496, + "epoch": 0.1333493657187519, + "flos": 24460876656000.0, + "grad_norm": 2.046550109177344, + "language_loss": 0.90809441, + "learning_rate": 3.889104287764451e-06, + "loss": 0.9284718, + "num_input_tokens_seen": 23561705, + "step": 1109, + "time_per_iteration": 2.812265157699585 + }, + { + "auxiliary_loss_clip": 0.01173817, + "auxiliary_loss_mlp": 0.01091745, + "balance_loss_clip": 1.04760194, + "balance_loss_mlp": 1.00872731, + "epoch": 0.13346960860939097, + "flos": 22158930677760.0, + "grad_norm": 2.4425423247769205, + "language_loss": 0.90085006, + "learning_rate": 3.888848360216798e-06, + "loss": 0.92350566, + "num_input_tokens_seen": 23579350, + "step": 1110, + "time_per_iteration": 4.65586256980896 + }, + { + "auxiliary_loss_clip": 0.01178688, + "auxiliary_loss_mlp": 0.01081614, + "balance_loss_clip": 1.05994606, + "balance_loss_mlp": 1.00112379, + "epoch": 0.13358985150003005, + "flos": 67931212608000.0, + "grad_norm": 0.7990447359655081, + "language_loss": 0.56626445, + "learning_rate": 3.888592146132351e-06, + "loss": 0.58886749, + "num_input_tokens_seen": 23640620, + "step": 1111, + "time_per_iteration": 4.440216064453125 + }, + { + "auxiliary_loss_clip": 0.01186718, + "auxiliary_loss_mlp": 0.01094117, + "balance_loss_clip": 1.0506531, + "balance_loss_mlp": 1.01062286, + "epoch": 0.13371009439066917, + "flos": 26834284742400.0, + "grad_norm": 1.733683984872842, + "language_loss": 0.78532255, + "learning_rate": 3.888335645549978e-06, + "loss": 0.80813086, + "num_input_tokens_seen": 23661040, + "step": 1112, + "time_per_iteration": 2.8454318046569824 + }, + { + "auxiliary_loss_clip": 0.01198177, + "auxiliary_loss_mlp": 0.01095377, + "balance_loss_clip": 1.05183673, + "balance_loss_mlp": 1.01207364, + "epoch": 0.13383033728130825, + "flos": 26322844942080.0, + "grad_norm": 2.1959140575597447, + "language_loss": 0.81612873, + "learning_rate": 3.888078858508588e-06, + "loss": 0.83906424, + "num_input_tokens_seen": 23680900, + "step": 1113, + "time_per_iteration": 3.7701852321624756 + }, + { + "auxiliary_loss_clip": 0.01173302, + "auxiliary_loss_mlp": 0.01093151, + "balance_loss_clip": 1.04805231, + "balance_loss_mlp": 1.00965691, + "epoch": 0.13395058017194733, + "flos": 22563931501440.0, + "grad_norm": 1.9088054506489063, + "language_loss": 0.84394896, + "learning_rate": 3.8878217850471365e-06, + "loss": 0.86661351, + "num_input_tokens_seen": 23700815, + "step": 1114, + "time_per_iteration": 2.7574329376220703 + }, + { + "auxiliary_loss_clip": 0.0119956, + "auxiliary_loss_mlp": 0.01092868, + "balance_loss_clip": 1.05320239, + "balance_loss_mlp": 1.0092783, + "epoch": 0.13407082306258641, + "flos": 25810938264960.0, + "grad_norm": 1.9664013340100777, + "language_loss": 0.73984033, + "learning_rate": 3.887564425204621e-06, + "loss": 0.76276457, + "num_input_tokens_seen": 23722500, + "step": 1115, + "time_per_iteration": 2.791365385055542 + }, + { + "auxiliary_loss_clip": 0.01162025, + "auxiliary_loss_mlp": 0.01081293, + "balance_loss_clip": 1.05300379, + "balance_loss_mlp": 1.00080287, + "epoch": 0.13419106595322552, + "flos": 68338365269760.0, + "grad_norm": 0.8436422165421904, + "language_loss": 0.54656833, + "learning_rate": 3.887306779020083e-06, + "loss": 0.5690015, + "num_input_tokens_seen": 23777155, + "step": 1116, + "time_per_iteration": 3.308973789215088 + }, + { + "auxiliary_loss_clip": 0.01187981, + "auxiliary_loss_mlp": 0.0109041, + "balance_loss_clip": 1.05081844, + "balance_loss_mlp": 1.00686765, + "epoch": 0.1343113088438646, + "flos": 20449080489600.0, + "grad_norm": 2.427689000659638, + "language_loss": 0.70191622, + "learning_rate": 3.887048846532608e-06, + "loss": 0.72470009, + "num_input_tokens_seen": 23794130, + "step": 1117, + "time_per_iteration": 2.7112865447998047 + }, + { + "auxiliary_loss_clip": 0.01158694, + "auxiliary_loss_mlp": 0.01082522, + "balance_loss_clip": 1.05776453, + "balance_loss_mlp": 1.0020318, + "epoch": 0.1344315517345037, + "flos": 67389784951680.0, + "grad_norm": 0.7593149971096612, + "language_loss": 0.5811224, + "learning_rate": 3.8867906277813224e-06, + "loss": 0.60353458, + "num_input_tokens_seen": 23852285, + "step": 1118, + "time_per_iteration": 3.3749139308929443 + }, + { + "auxiliary_loss_clip": 0.01190917, + "auxiliary_loss_mlp": 0.0087489, + "balance_loss_clip": 1.0531404, + "balance_loss_mlp": 1.00014687, + "epoch": 0.1345517946251428, + "flos": 40734442788480.0, + "grad_norm": 1.8458102406976122, + "language_loss": 0.73916519, + "learning_rate": 3.886532122805399e-06, + "loss": 0.75982332, + "num_input_tokens_seen": 23874765, + "step": 1119, + "time_per_iteration": 2.863098382949829 + }, + { + "auxiliary_loss_clip": 0.01132967, + "auxiliary_loss_mlp": 0.01093937, + "balance_loss_clip": 1.04528832, + "balance_loss_mlp": 1.01039553, + "epoch": 0.13467203751578188, + "flos": 22816850140800.0, + "grad_norm": 1.8084782435787459, + "language_loss": 0.89889407, + "learning_rate": 3.886273331644053e-06, + "loss": 0.92116314, + "num_input_tokens_seen": 23893635, + "step": 1120, + "time_per_iteration": 2.9002838134765625 + }, + { + "auxiliary_loss_clip": 0.01147943, + "auxiliary_loss_mlp": 0.01094287, + "balance_loss_clip": 1.04194796, + "balance_loss_mlp": 1.01093602, + "epoch": 0.13479228040642097, + "flos": 17091576512640.0, + "grad_norm": 1.8661703102728575, + "language_loss": 0.82205558, + "learning_rate": 3.886014254336542e-06, + "loss": 0.84447789, + "num_input_tokens_seen": 23910110, + "step": 1121, + "time_per_iteration": 2.866520881652832 + }, + { + "auxiliary_loss_clip": 0.0118225, + "auxiliary_loss_mlp": 0.0109423, + "balance_loss_clip": 1.05091786, + "balance_loss_mlp": 1.01073551, + "epoch": 0.13491252329706005, + "flos": 23730525417600.0, + "grad_norm": 1.681496751536853, + "language_loss": 0.92842263, + "learning_rate": 3.885754890922168e-06, + "loss": 0.95118743, + "num_input_tokens_seen": 23930440, + "step": 1122, + "time_per_iteration": 2.7810261249542236 + }, + { + "auxiliary_loss_clip": 0.01118785, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_clip": 1.03868151, + "balance_loss_mlp": 1.00903702, + "epoch": 0.13503276618769916, + "flos": 34127058960000.0, + "grad_norm": 1.754783612532569, + "language_loss": 0.78687024, + "learning_rate": 3.885495241440277e-06, + "loss": 0.80898345, + "num_input_tokens_seen": 23954535, + "step": 1123, + "time_per_iteration": 3.0242881774902344 + }, + { + "auxiliary_loss_clip": 0.01196853, + "auxiliary_loss_mlp": 0.01091744, + "balance_loss_clip": 1.05132496, + "balance_loss_mlp": 1.00825036, + "epoch": 0.13515300907833824, + "flos": 17712328377600.0, + "grad_norm": 1.7397444916767066, + "language_loss": 0.74402553, + "learning_rate": 3.885235305930257e-06, + "loss": 0.76691151, + "num_input_tokens_seen": 23972735, + "step": 1124, + "time_per_iteration": 2.610348701477051 + }, + { + "auxiliary_loss_clip": 0.01172309, + "auxiliary_loss_mlp": 0.01093752, + "balance_loss_clip": 1.05276, + "balance_loss_mlp": 1.01040077, + "epoch": 0.13527325196897733, + "flos": 20260872201600.0, + "grad_norm": 1.968363398483404, + "language_loss": 0.85432076, + "learning_rate": 3.884975084431539e-06, + "loss": 0.87698132, + "num_input_tokens_seen": 23987685, + "step": 1125, + "time_per_iteration": 2.8027865886688232 + }, + { + "auxiliary_loss_clip": 0.01190844, + "auxiliary_loss_mlp": 0.00874933, + "balance_loss_clip": 1.05344141, + "balance_loss_mlp": 1.00009298, + "epoch": 0.13539349485961644, + "flos": 18186492839040.0, + "grad_norm": 2.240506471662462, + "language_loss": 0.91146922, + "learning_rate": 3.8847145769836e-06, + "loss": 0.932127, + "num_input_tokens_seen": 24004105, + "step": 1126, + "time_per_iteration": 2.7689356803894043 + }, + { + "auxiliary_loss_clip": 0.01200212, + "auxiliary_loss_mlp": 0.01093832, + "balance_loss_clip": 1.05411839, + "balance_loss_mlp": 1.01009965, + "epoch": 0.13551373775025552, + "flos": 19317463441920.0, + "grad_norm": 2.526116198567434, + "language_loss": 0.66451705, + "learning_rate": 3.884453783625959e-06, + "loss": 0.68745756, + "num_input_tokens_seen": 24021715, + "step": 1127, + "time_per_iteration": 2.752272844314575 + }, + { + "auxiliary_loss_clip": 0.01168381, + "auxiliary_loss_mlp": 0.01093584, + "balance_loss_clip": 1.04818165, + "balance_loss_mlp": 1.01013803, + "epoch": 0.1356339806408946, + "flos": 20850813175680.0, + "grad_norm": 3.229749602309809, + "language_loss": 0.84855795, + "learning_rate": 3.884192704398176e-06, + "loss": 0.87117761, + "num_input_tokens_seen": 24038915, + "step": 1128, + "time_per_iteration": 2.7766835689544678 + }, + { + "auxiliary_loss_clip": 0.01190456, + "auxiliary_loss_mlp": 0.01092224, + "balance_loss_clip": 1.05222511, + "balance_loss_mlp": 1.0087775, + "epoch": 0.13575422353153369, + "flos": 50476037696640.0, + "grad_norm": 1.5492836874831313, + "language_loss": 0.74292314, + "learning_rate": 3.883931339339858e-06, + "loss": 0.76574993, + "num_input_tokens_seen": 24063300, + "step": 1129, + "time_per_iteration": 2.9532294273376465 + }, + { + "auxiliary_loss_clip": 0.01184432, + "auxiliary_loss_mlp": 0.01091729, + "balance_loss_clip": 1.04725695, + "balance_loss_mlp": 1.00818682, + "epoch": 0.1358744664221728, + "flos": 18150797698560.0, + "grad_norm": 3.115983637190077, + "language_loss": 0.786672, + "learning_rate": 3.883669688490654e-06, + "loss": 0.80943358, + "num_input_tokens_seen": 24081070, + "step": 1130, + "time_per_iteration": 2.6418633460998535 + }, + { + "auxiliary_loss_clip": 0.01178878, + "auxiliary_loss_mlp": 0.00874941, + "balance_loss_clip": 1.05137455, + "balance_loss_mlp": 1.00014925, + "epoch": 0.13599470931281188, + "flos": 18442966924800.0, + "grad_norm": 2.061226945082278, + "language_loss": 0.857867, + "learning_rate": 3.883407751890256e-06, + "loss": 0.87840521, + "num_input_tokens_seen": 24099675, + "step": 1131, + "time_per_iteration": 2.768510103225708 + }, + { + "auxiliary_loss_clip": 0.01168048, + "auxiliary_loss_mlp": 0.01095638, + "balance_loss_clip": 1.04821157, + "balance_loss_mlp": 1.01181018, + "epoch": 0.13611495220345096, + "flos": 26680766014080.0, + "grad_norm": 1.6607377980425202, + "language_loss": 0.85896665, + "learning_rate": 3.8831455295783994e-06, + "loss": 0.88160348, + "num_input_tokens_seen": 24118925, + "step": 1132, + "time_per_iteration": 2.8404083251953125 + }, + { + "auxiliary_loss_clip": 0.01179841, + "auxiliary_loss_mlp": 0.01092637, + "balance_loss_clip": 1.05072308, + "balance_loss_mlp": 1.00919032, + "epoch": 0.13623519509409007, + "flos": 21686238673920.0, + "grad_norm": 2.0215440194998933, + "language_loss": 0.74283898, + "learning_rate": 3.882883021594864e-06, + "loss": 0.76556373, + "num_input_tokens_seen": 24137065, + "step": 1133, + "time_per_iteration": 2.732937812805176 + }, + { + "auxiliary_loss_clip": 0.01154886, + "auxiliary_loss_mlp": 0.01092915, + "balance_loss_clip": 1.04564309, + "balance_loss_mlp": 1.00951612, + "epoch": 0.13635543798472916, + "flos": 14830389492480.0, + "grad_norm": 1.8964258622178065, + "language_loss": 0.86800951, + "learning_rate": 3.8826202279794705e-06, + "loss": 0.89048755, + "num_input_tokens_seen": 24154125, + "step": 1134, + "time_per_iteration": 2.771980047225952 + }, + { + "auxiliary_loss_clip": 0.01199517, + "auxiliary_loss_mlp": 0.0109131, + "balance_loss_clip": 1.05376887, + "balance_loss_mlp": 1.00771999, + "epoch": 0.13647568087536824, + "flos": 22890323410560.0, + "grad_norm": 2.4377893998984876, + "language_loss": 0.70417303, + "learning_rate": 3.882357148772085e-06, + "loss": 0.7270813, + "num_input_tokens_seen": 24171550, + "step": 1135, + "time_per_iteration": 4.56389307975769 + }, + { + "auxiliary_loss_clip": 0.01153086, + "auxiliary_loss_mlp": 0.01090747, + "balance_loss_clip": 1.04875684, + "balance_loss_mlp": 1.00720489, + "epoch": 0.13659592376600732, + "flos": 19937927998080.0, + "grad_norm": 2.4081790454103387, + "language_loss": 0.84109509, + "learning_rate": 3.882093784012617e-06, + "loss": 0.86353344, + "num_input_tokens_seen": 24190190, + "step": 1136, + "time_per_iteration": 3.794400215148926 + }, + { + "auxiliary_loss_clip": 0.01171676, + "auxiliary_loss_mlp": 0.0109323, + "balance_loss_clip": 1.04620111, + "balance_loss_mlp": 1.00983119, + "epoch": 0.13671616665664643, + "flos": 21428579439360.0, + "grad_norm": 1.6924098855741028, + "language_loss": 0.84260023, + "learning_rate": 3.881830133741019e-06, + "loss": 0.86524928, + "num_input_tokens_seen": 24209055, + "step": 1137, + "time_per_iteration": 2.769136905670166 + }, + { + "auxiliary_loss_clip": 0.01161403, + "auxiliary_loss_mlp": 0.01092216, + "balance_loss_clip": 1.04496825, + "balance_loss_mlp": 1.00881708, + "epoch": 0.13683640954728551, + "flos": 22778138257920.0, + "grad_norm": 2.3450807263005173, + "language_loss": 0.76464689, + "learning_rate": 3.881566197997285e-06, + "loss": 0.78718305, + "num_input_tokens_seen": 24225490, + "step": 1138, + "time_per_iteration": 3.77130126953125 + }, + { + "auxiliary_loss_clip": 0.01173377, + "auxiliary_loss_mlp": 0.01093754, + "balance_loss_clip": 1.04884863, + "balance_loss_mlp": 1.01059401, + "epoch": 0.1369566524379246, + "flos": 21725884310400.0, + "grad_norm": 1.670372436567903, + "language_loss": 0.7486667, + "learning_rate": 3.881301976821456e-06, + "loss": 0.77133799, + "num_input_tokens_seen": 24245520, + "step": 1139, + "time_per_iteration": 2.728309392929077 + }, + { + "auxiliary_loss_clip": 0.01187294, + "auxiliary_loss_mlp": 0.01092522, + "balance_loss_clip": 1.05265212, + "balance_loss_mlp": 1.00912321, + "epoch": 0.1370768953285637, + "flos": 18624459369600.0, + "grad_norm": 1.9430402124898594, + "language_loss": 0.90351272, + "learning_rate": 3.881037470253612e-06, + "loss": 0.9263109, + "num_input_tokens_seen": 24265035, + "step": 1140, + "time_per_iteration": 2.8673739433288574 + }, + { + "auxiliary_loss_clip": 0.01154449, + "auxiliary_loss_mlp": 0.01096544, + "balance_loss_clip": 1.04755664, + "balance_loss_mlp": 1.01314473, + "epoch": 0.1371971382192028, + "flos": 14939521989120.0, + "grad_norm": 7.533218531207799, + "language_loss": 0.79943204, + "learning_rate": 3.88077267833388e-06, + "loss": 0.82194203, + "num_input_tokens_seen": 24281550, + "step": 1141, + "time_per_iteration": 2.806626796722412 + }, + { + "auxiliary_loss_clip": 0.01151521, + "auxiliary_loss_mlp": 0.01094192, + "balance_loss_clip": 1.04362833, + "balance_loss_mlp": 1.0106976, + "epoch": 0.13731738110984187, + "flos": 19023785844480.0, + "grad_norm": 2.060215140904391, + "language_loss": 0.83911479, + "learning_rate": 3.880507601102427e-06, + "loss": 0.86157191, + "num_input_tokens_seen": 24299485, + "step": 1142, + "time_per_iteration": 2.792698383331299 + }, + { + "auxiliary_loss_clip": 0.01199586, + "auxiliary_loss_mlp": 0.01093445, + "balance_loss_clip": 1.05422544, + "balance_loss_mlp": 1.01009357, + "epoch": 0.13743762400048098, + "flos": 18187462506240.0, + "grad_norm": 2.0198762387433145, + "language_loss": 0.81904924, + "learning_rate": 3.880242238599467e-06, + "loss": 0.8419795, + "num_input_tokens_seen": 24316010, + "step": 1143, + "time_per_iteration": 2.614023208618164 + }, + { + "auxiliary_loss_clip": 0.01198696, + "auxiliary_loss_mlp": 0.01093672, + "balance_loss_clip": 1.05354595, + "balance_loss_mlp": 1.01027346, + "epoch": 0.13755786689112007, + "flos": 21031982398080.0, + "grad_norm": 1.6290418211989561, + "language_loss": 0.8313489, + "learning_rate": 3.879976590865254e-06, + "loss": 0.8542726, + "num_input_tokens_seen": 24335465, + "step": 1144, + "time_per_iteration": 2.7049763202667236 + }, + { + "auxiliary_loss_clip": 0.01176993, + "auxiliary_loss_mlp": 0.01095753, + "balance_loss_clip": 1.04983926, + "balance_loss_mlp": 1.01244962, + "epoch": 0.13767810978175915, + "flos": 21360636864000.0, + "grad_norm": 1.9095839978719689, + "language_loss": 0.87305182, + "learning_rate": 3.879710657940087e-06, + "loss": 0.89577925, + "num_input_tokens_seen": 24354415, + "step": 1145, + "time_per_iteration": 2.7591495513916016 + }, + { + "auxiliary_loss_clip": 0.01186122, + "auxiliary_loss_mlp": 0.01092547, + "balance_loss_clip": 1.05048013, + "balance_loss_mlp": 1.00891018, + "epoch": 0.13779835267239823, + "flos": 30592084861440.0, + "grad_norm": 2.0600877269858033, + "language_loss": 0.70336819, + "learning_rate": 3.879444439864308e-06, + "loss": 0.72615486, + "num_input_tokens_seen": 24373990, + "step": 1146, + "time_per_iteration": 2.7475342750549316 + }, + { + "auxiliary_loss_clip": 0.01180866, + "auxiliary_loss_mlp": 0.00874998, + "balance_loss_clip": 1.04968762, + "balance_loss_mlp": 1.00021052, + "epoch": 0.13791859556303734, + "flos": 22669867687680.0, + "grad_norm": 2.248072932094013, + "language_loss": 0.86177623, + "learning_rate": 3.879177936678301e-06, + "loss": 0.88233483, + "num_input_tokens_seen": 24392995, + "step": 1147, + "time_per_iteration": 2.744981288909912 + }, + { + "auxiliary_loss_clip": 0.01178253, + "auxiliary_loss_mlp": 0.01095013, + "balance_loss_clip": 1.04985785, + "balance_loss_mlp": 1.01123238, + "epoch": 0.13803883845367643, + "flos": 35224166016000.0, + "grad_norm": 1.8159419812483897, + "language_loss": 0.77315617, + "learning_rate": 3.878911148422496e-06, + "loss": 0.79588878, + "num_input_tokens_seen": 24414470, + "step": 1148, + "time_per_iteration": 2.8655591011047363 + }, + { + "auxiliary_loss_clip": 0.01187955, + "auxiliary_loss_mlp": 0.01090943, + "balance_loss_clip": 1.05176377, + "balance_loss_mlp": 1.00749636, + "epoch": 0.1381590813443155, + "flos": 32014542332160.0, + "grad_norm": 3.0327701489574954, + "language_loss": 0.70434076, + "learning_rate": 3.878644075137364e-06, + "loss": 0.72712982, + "num_input_tokens_seen": 24435120, + "step": 1149, + "time_per_iteration": 2.8520781993865967 + }, + { + "auxiliary_loss_clip": 0.01158486, + "auxiliary_loss_mlp": 0.01090362, + "balance_loss_clip": 1.04730463, + "balance_loss_mlp": 1.00682032, + "epoch": 0.13827932423495462, + "flos": 17821855923840.0, + "grad_norm": 2.386411133285662, + "language_loss": 0.79240918, + "learning_rate": 3.878376716863418e-06, + "loss": 0.81489766, + "num_input_tokens_seen": 24451420, + "step": 1150, + "time_per_iteration": 2.735572576522827 + }, + { + "auxiliary_loss_clip": 0.01163481, + "auxiliary_loss_mlp": 0.0109225, + "balance_loss_clip": 1.04480124, + "balance_loss_mlp": 1.00851703, + "epoch": 0.1383995671255937, + "flos": 19427098728960.0, + "grad_norm": 1.8459675385112542, + "language_loss": 0.71532822, + "learning_rate": 3.878109073641219e-06, + "loss": 0.73788553, + "num_input_tokens_seen": 24470450, + "step": 1151, + "time_per_iteration": 2.709705114364624 + }, + { + "auxiliary_loss_clip": 0.01144221, + "auxiliary_loss_mlp": 0.01094102, + "balance_loss_clip": 1.03924978, + "balance_loss_mlp": 1.01070333, + "epoch": 0.13851981001623279, + "flos": 28296603331200.0, + "grad_norm": 1.7067283027808864, + "language_loss": 0.81080055, + "learning_rate": 3.877841145511366e-06, + "loss": 0.83318377, + "num_input_tokens_seen": 24493190, + "step": 1152, + "time_per_iteration": 2.893510341644287 + }, + { + "auxiliary_loss_clip": 0.01189621, + "auxiliary_loss_mlp": 0.01093259, + "balance_loss_clip": 1.05241513, + "balance_loss_mlp": 1.0096221, + "epoch": 0.13864005290687187, + "flos": 21213079793280.0, + "grad_norm": 1.6611594011758835, + "language_loss": 0.82794112, + "learning_rate": 3.8775729325145035e-06, + "loss": 0.85076994, + "num_input_tokens_seen": 24512425, + "step": 1153, + "time_per_iteration": 2.74983286857605 + }, + { + "auxiliary_loss_clip": 0.01138296, + "auxiliary_loss_mlp": 0.01082561, + "balance_loss_clip": 1.04825592, + "balance_loss_mlp": 1.00207043, + "epoch": 0.13876029579751098, + "flos": 71653389413760.0, + "grad_norm": 2.3870616479483995, + "language_loss": 0.64715874, + "learning_rate": 3.877304434691321e-06, + "loss": 0.66936731, + "num_input_tokens_seen": 24579275, + "step": 1154, + "time_per_iteration": 3.4253056049346924 + }, + { + "auxiliary_loss_clip": 0.01162176, + "auxiliary_loss_mlp": 0.01090686, + "balance_loss_clip": 1.04600215, + "balance_loss_mlp": 1.00757337, + "epoch": 0.13888053868815006, + "flos": 21941348042880.0, + "grad_norm": 1.7164299599195925, + "language_loss": 0.79983121, + "learning_rate": 3.877035652082548e-06, + "loss": 0.8223598, + "num_input_tokens_seen": 24598720, + "step": 1155, + "time_per_iteration": 2.7747015953063965 + }, + { + "auxiliary_loss_clip": 0.0116906, + "auxiliary_loss_mlp": 0.01093679, + "balance_loss_clip": 1.05074275, + "balance_loss_mlp": 1.01004195, + "epoch": 0.13900078157878915, + "flos": 19608627087360.0, + "grad_norm": 2.1381432733126, + "language_loss": 0.85476732, + "learning_rate": 3.87676658472896e-06, + "loss": 0.87739474, + "num_input_tokens_seen": 24617530, + "step": 1156, + "time_per_iteration": 2.764705181121826 + }, + { + "auxiliary_loss_clip": 0.01179096, + "auxiliary_loss_mlp": 0.01092324, + "balance_loss_clip": 1.04854345, + "balance_loss_mlp": 1.00863934, + "epoch": 0.13912102446942826, + "flos": 22638051216000.0, + "grad_norm": 2.515576042620916, + "language_loss": 0.85312074, + "learning_rate": 3.876497232671372e-06, + "loss": 0.87583494, + "num_input_tokens_seen": 24637485, + "step": 1157, + "time_per_iteration": 2.6952028274536133 + }, + { + "auxiliary_loss_clip": 0.01153998, + "auxiliary_loss_mlp": 0.01092465, + "balance_loss_clip": 1.04489255, + "balance_loss_mlp": 1.00897074, + "epoch": 0.13924126736006734, + "flos": 29643324975360.0, + "grad_norm": 2.4099730401349846, + "language_loss": 0.83385336, + "learning_rate": 3.876227595950647e-06, + "loss": 0.85631806, + "num_input_tokens_seen": 24656915, + "step": 1158, + "time_per_iteration": 2.8630995750427246 + }, + { + "auxiliary_loss_clip": 0.01197171, + "auxiliary_loss_mlp": 0.0109366, + "balance_loss_clip": 1.05288327, + "balance_loss_mlp": 1.01040399, + "epoch": 0.13936151025070642, + "flos": 27417653527680.0, + "grad_norm": 1.6643005688762609, + "language_loss": 0.78674561, + "learning_rate": 3.875957674607686e-06, + "loss": 0.80965388, + "num_input_tokens_seen": 24679190, + "step": 1159, + "time_per_iteration": 2.7067043781280518 + }, + { + "auxiliary_loss_clip": 0.01184506, + "auxiliary_loss_mlp": 0.0087513, + "balance_loss_clip": 1.04847622, + "balance_loss_mlp": 1.00024581, + "epoch": 0.1394817531413455, + "flos": 16399326625920.0, + "grad_norm": 4.20620759651745, + "language_loss": 0.88067627, + "learning_rate": 3.8756874686834386e-06, + "loss": 0.90127259, + "num_input_tokens_seen": 24697405, + "step": 1160, + "time_per_iteration": 2.764983654022217 + }, + { + "auxiliary_loss_clip": 0.01186422, + "auxiliary_loss_mlp": 0.00875045, + "balance_loss_clip": 1.05008006, + "balance_loss_mlp": 1.00022078, + "epoch": 0.13960199603198462, + "flos": 30922319525760.0, + "grad_norm": 1.6225513423909714, + "language_loss": 0.80132765, + "learning_rate": 3.875416978218893e-06, + "loss": 0.82194233, + "num_input_tokens_seen": 24720600, + "step": 1161, + "time_per_iteration": 3.761507987976074 + }, + { + "auxiliary_loss_clip": 0.01170428, + "auxiliary_loss_mlp": 0.01093972, + "balance_loss_clip": 1.04922247, + "balance_loss_mlp": 1.01028705, + "epoch": 0.1397222389226237, + "flos": 18113773754880.0, + "grad_norm": 2.8581291538705096, + "language_loss": 0.82890344, + "learning_rate": 3.8751462032550835e-06, + "loss": 0.85154748, + "num_input_tokens_seen": 24737605, + "step": 1162, + "time_per_iteration": 3.7602670192718506 + }, + { + "auxiliary_loss_clip": 0.0116886, + "auxiliary_loss_mlp": 0.01091918, + "balance_loss_clip": 1.04998207, + "balance_loss_mlp": 1.0088526, + "epoch": 0.13984248181326278, + "flos": 16872772815360.0, + "grad_norm": 2.5435920123874496, + "language_loss": 0.82878399, + "learning_rate": 3.874875143833085e-06, + "loss": 0.85139179, + "num_input_tokens_seen": 24755845, + "step": 1163, + "time_per_iteration": 3.7210211753845215 + }, + { + "auxiliary_loss_clip": 0.01184116, + "auxiliary_loss_mlp": 0.0109339, + "balance_loss_clip": 1.04941332, + "balance_loss_mlp": 1.00951433, + "epoch": 0.1399627247039019, + "flos": 54121401267840.0, + "grad_norm": 3.3187905917120344, + "language_loss": 0.68924731, + "learning_rate": 3.874603799994019e-06, + "loss": 0.71202242, + "num_input_tokens_seen": 24779380, + "step": 1164, + "time_per_iteration": 3.0113110542297363 + }, + { + "auxiliary_loss_clip": 0.0116818, + "auxiliary_loss_mlp": 0.01093658, + "balance_loss_clip": 1.04993963, + "balance_loss_mlp": 1.01035404, + "epoch": 0.14008296759454097, + "flos": 11765521618560.0, + "grad_norm": 2.258176524840277, + "language_loss": 0.86795008, + "learning_rate": 3.874332171779046e-06, + "loss": 0.89056844, + "num_input_tokens_seen": 24794260, + "step": 1165, + "time_per_iteration": 2.763604164123535 + }, + { + "auxiliary_loss_clip": 0.01157747, + "auxiliary_loss_mlp": 0.01096252, + "balance_loss_clip": 1.04307723, + "balance_loss_mlp": 1.01270962, + "epoch": 0.14020321048518006, + "flos": 22017514832640.0, + "grad_norm": 1.7131273773332987, + "language_loss": 0.75499117, + "learning_rate": 3.874060259229373e-06, + "loss": 0.77753115, + "num_input_tokens_seen": 24815835, + "step": 1166, + "time_per_iteration": 2.848085403442383 + }, + { + "auxiliary_loss_clip": 0.0118627, + "auxiliary_loss_mlp": 0.01096849, + "balance_loss_clip": 1.05173755, + "balance_loss_mlp": 1.01302075, + "epoch": 0.14032345337581917, + "flos": 23404313076480.0, + "grad_norm": 2.409983499170033, + "language_loss": 0.93766123, + "learning_rate": 3.873788062386249e-06, + "loss": 0.96049243, + "num_input_tokens_seen": 24834095, + "step": 1167, + "time_per_iteration": 2.717465877532959 + }, + { + "auxiliary_loss_clip": 0.01162945, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_clip": 1.04599154, + "balance_loss_mlp": 1.00871396, + "epoch": 0.14044369626645825, + "flos": 29645767100160.0, + "grad_norm": 1.7590676394151092, + "language_loss": 0.82012856, + "learning_rate": 3.873515581290965e-06, + "loss": 0.8426801, + "num_input_tokens_seen": 24858900, + "step": 1168, + "time_per_iteration": 2.8977015018463135 + }, + { + "auxiliary_loss_clip": 0.01163113, + "auxiliary_loss_mlp": 0.01092562, + "balance_loss_clip": 1.04754806, + "balance_loss_mlp": 1.00916362, + "epoch": 0.14056393915709733, + "flos": 18332972501760.0, + "grad_norm": 2.966204928683065, + "language_loss": 0.75801849, + "learning_rate": 3.8732428159848575e-06, + "loss": 0.78057522, + "num_input_tokens_seen": 24877875, + "step": 1169, + "time_per_iteration": 2.8035364151000977 + }, + { + "auxiliary_loss_clip": 0.01184714, + "auxiliary_loss_mlp": 0.01094394, + "balance_loss_clip": 1.05129254, + "balance_loss_mlp": 1.01066148, + "epoch": 0.14068418204773642, + "flos": 26687517770880.0, + "grad_norm": 2.1931912927036947, + "language_loss": 0.78162968, + "learning_rate": 3.872969766509304e-06, + "loss": 0.80442071, + "num_input_tokens_seen": 24898430, + "step": 1170, + "time_per_iteration": 2.808412551879883 + }, + { + "auxiliary_loss_clip": 0.01156659, + "auxiliary_loss_mlp": 0.01080566, + "balance_loss_clip": 1.05173707, + "balance_loss_mlp": 1.00007558, + "epoch": 0.14080442493837553, + "flos": 65259314501760.0, + "grad_norm": 0.7637620522951755, + "language_loss": 0.5563724, + "learning_rate": 3.872696432905726e-06, + "loss": 0.57874465, + "num_input_tokens_seen": 24959250, + "step": 1171, + "time_per_iteration": 3.356294631958008 + }, + { + "auxiliary_loss_clip": 0.01184851, + "auxiliary_loss_mlp": 0.01094202, + "balance_loss_clip": 1.04924953, + "balance_loss_mlp": 1.01027846, + "epoch": 0.1409246678290146, + "flos": 25776715582080.0, + "grad_norm": 2.3511750115037566, + "language_loss": 0.71710098, + "learning_rate": 3.872422815215589e-06, + "loss": 0.73989141, + "num_input_tokens_seen": 24978330, + "step": 1172, + "time_per_iteration": 2.6928887367248535 + }, + { + "auxiliary_loss_clip": 0.01185414, + "auxiliary_loss_mlp": 0.01093804, + "balance_loss_clip": 1.04873264, + "balance_loss_mlp": 1.01002336, + "epoch": 0.1410449107196537, + "flos": 21868521217920.0, + "grad_norm": 1.782076850287829, + "language_loss": 0.74027389, + "learning_rate": 3.8721489134803994e-06, + "loss": 0.76306611, + "num_input_tokens_seen": 24997120, + "step": 1173, + "time_per_iteration": 2.741210699081421 + }, + { + "auxiliary_loss_clip": 0.01177796, + "auxiliary_loss_mlp": 0.0109289, + "balance_loss_clip": 1.04574549, + "balance_loss_mlp": 1.00934839, + "epoch": 0.1411651536102928, + "flos": 16684133564160.0, + "grad_norm": 2.3226277637577417, + "language_loss": 0.72371733, + "learning_rate": 3.871874727741707e-06, + "loss": 0.7464242, + "num_input_tokens_seen": 25014350, + "step": 1174, + "time_per_iteration": 2.6872167587280273 + }, + { + "auxiliary_loss_clip": 0.01179815, + "auxiliary_loss_mlp": 0.01093042, + "balance_loss_clip": 1.04876935, + "balance_loss_mlp": 1.00983369, + "epoch": 0.1412853965009319, + "flos": 20992264934400.0, + "grad_norm": 1.7455578307629749, + "language_loss": 0.96553952, + "learning_rate": 3.871600258041108e-06, + "loss": 0.98826814, + "num_input_tokens_seen": 25033875, + "step": 1175, + "time_per_iteration": 2.687286138534546 + }, + { + "auxiliary_loss_clip": 0.01177799, + "auxiliary_loss_mlp": 0.01093051, + "balance_loss_clip": 1.05069208, + "balance_loss_mlp": 1.00941396, + "epoch": 0.14140563939157097, + "flos": 20335279224960.0, + "grad_norm": 2.1946520261080984, + "language_loss": 0.85727704, + "learning_rate": 3.871325504420238e-06, + "loss": 0.87998557, + "num_input_tokens_seen": 25052865, + "step": 1176, + "time_per_iteration": 2.8804666996002197 + }, + { + "auxiliary_loss_clip": 0.01195883, + "auxiliary_loss_mlp": 0.01091277, + "balance_loss_clip": 1.05217838, + "balance_loss_mlp": 1.00759256, + "epoch": 0.14152588228221005, + "flos": 21068826773760.0, + "grad_norm": 2.4184256650596505, + "language_loss": 0.81817418, + "learning_rate": 3.871050466920776e-06, + "loss": 0.8410458, + "num_input_tokens_seen": 25072770, + "step": 1177, + "time_per_iteration": 2.7438220977783203 + }, + { + "auxiliary_loss_clip": 0.01161104, + "auxiliary_loss_mlp": 0.0109399, + "balance_loss_clip": 1.04961586, + "balance_loss_mlp": 1.01063919, + "epoch": 0.14164612517284916, + "flos": 18223157646720.0, + "grad_norm": 4.288471434964554, + "language_loss": 0.79399943, + "learning_rate": 3.870775145584447e-06, + "loss": 0.81655043, + "num_input_tokens_seen": 25090550, + "step": 1178, + "time_per_iteration": 2.7586519718170166 + }, + { + "auxiliary_loss_clip": 0.01174542, + "auxiliary_loss_mlp": 0.01091278, + "balance_loss_clip": 1.04852057, + "balance_loss_mlp": 1.00721204, + "epoch": 0.14176636806348825, + "flos": 22744454279040.0, + "grad_norm": 3.1636306985525793, + "language_loss": 0.64641207, + "learning_rate": 3.8704995404530145e-06, + "loss": 0.6690703, + "num_input_tokens_seen": 25106175, + "step": 1179, + "time_per_iteration": 2.699903726577759 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_clip": 1.05312419, + "balance_loss_mlp": 1.01090646, + "epoch": 0.14188661095412733, + "flos": 22091095843200.0, + "grad_norm": 1.922067364597385, + "language_loss": 0.84840405, + "learning_rate": 3.87022365156829e-06, + "loss": 0.87130862, + "num_input_tokens_seen": 25126890, + "step": 1180, + "time_per_iteration": 2.753998041152954 + }, + { + "auxiliary_loss_clip": 0.01127442, + "auxiliary_loss_mlp": 0.01092573, + "balance_loss_clip": 1.04402459, + "balance_loss_mlp": 1.00922239, + "epoch": 0.14200685384476644, + "flos": 24352390604160.0, + "grad_norm": 1.931929407502938, + "language_loss": 0.80917788, + "learning_rate": 3.869947478972123e-06, + "loss": 0.83137804, + "num_input_tokens_seen": 25147915, + "step": 1181, + "time_per_iteration": 2.963951349258423 + }, + { + "auxiliary_loss_clip": 0.01186514, + "auxiliary_loss_mlp": 0.01093522, + "balance_loss_clip": 1.05160975, + "balance_loss_mlp": 1.00983739, + "epoch": 0.14212709673540552, + "flos": 24022048199040.0, + "grad_norm": 1.8286670931127662, + "language_loss": 0.82448, + "learning_rate": 3.869671022706412e-06, + "loss": 0.84728038, + "num_input_tokens_seen": 25166645, + "step": 1182, + "time_per_iteration": 2.792623996734619 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_clip": 1.04828179, + "balance_loss_mlp": 1.01121426, + "epoch": 0.1422473396260446, + "flos": 26431797870720.0, + "grad_norm": 1.8261385039842801, + "language_loss": 0.65078688, + "learning_rate": 3.869394282813092e-06, + "loss": 0.6732403, + "num_input_tokens_seen": 25185845, + "step": 1183, + "time_per_iteration": 2.9118704795837402 + }, + { + "auxiliary_loss_clip": 0.0116988, + "auxiliary_loss_mlp": 0.01093722, + "balance_loss_clip": 1.05160546, + "balance_loss_mlp": 1.01013267, + "epoch": 0.1423675825166837, + "flos": 17055306754560.0, + "grad_norm": 2.9254170343216312, + "language_loss": 0.89041013, + "learning_rate": 3.869117259334147e-06, + "loss": 0.91304618, + "num_input_tokens_seen": 25203770, + "step": 1184, + "time_per_iteration": 2.8204681873321533 + }, + { + "auxiliary_loss_clip": 0.01189599, + "auxiliary_loss_mlp": 0.01093074, + "balance_loss_clip": 1.05398083, + "balance_loss_mlp": 1.009866, + "epoch": 0.1424878254073228, + "flos": 17929480049280.0, + "grad_norm": 1.6822942231408675, + "language_loss": 0.82152855, + "learning_rate": 3.868839952311599e-06, + "loss": 0.84435529, + "num_input_tokens_seen": 25221725, + "step": 1185, + "time_per_iteration": 2.753883123397827 + }, + { + "auxiliary_loss_clip": 0.01164211, + "auxiliary_loss_mlp": 0.010923, + "balance_loss_clip": 1.04710722, + "balance_loss_mlp": 1.00885344, + "epoch": 0.14260806829796188, + "flos": 20303606407680.0, + "grad_norm": 2.137447437239634, + "language_loss": 0.80391157, + "learning_rate": 3.868562361787516e-06, + "loss": 0.82647675, + "num_input_tokens_seen": 25240855, + "step": 1186, + "time_per_iteration": 3.604172945022583 + }, + { + "auxiliary_loss_clip": 0.01129301, + "auxiliary_loss_mlp": 0.01092577, + "balance_loss_clip": 1.04392886, + "balance_loss_mlp": 1.00913072, + "epoch": 0.14272831118860096, + "flos": 23185724860800.0, + "grad_norm": 1.9115039696302614, + "language_loss": 0.68875116, + "learning_rate": 3.868284487804009e-06, + "loss": 0.71096992, + "num_input_tokens_seen": 25260085, + "step": 1187, + "time_per_iteration": 4.8652307987213135 + }, + { + "auxiliary_loss_clip": 0.01179343, + "auxiliary_loss_mlp": 0.01090538, + "balance_loss_clip": 1.05262685, + "balance_loss_mlp": 1.00718713, + "epoch": 0.14284855407924008, + "flos": 27232210586880.0, + "grad_norm": 2.1302518298154154, + "language_loss": 0.78283846, + "learning_rate": 3.86800633040323e-06, + "loss": 0.80553728, + "num_input_tokens_seen": 25280675, + "step": 1188, + "time_per_iteration": 2.854229688644409 + }, + { + "auxiliary_loss_clip": 0.01171092, + "auxiliary_loss_mlp": 0.0087493, + "balance_loss_clip": 1.04857433, + "balance_loss_mlp": 1.00025892, + "epoch": 0.14296879696987916, + "flos": 28184202696960.0, + "grad_norm": 1.9131431239265229, + "language_loss": 0.78069401, + "learning_rate": 3.867727889627376e-06, + "loss": 0.8011542, + "num_input_tokens_seen": 25300290, + "step": 1189, + "time_per_iteration": 3.771254062652588 + }, + { + "auxiliary_loss_clip": 0.01161218, + "auxiliary_loss_mlp": 0.01090817, + "balance_loss_clip": 1.04692674, + "balance_loss_mlp": 1.00717926, + "epoch": 0.14308903986051824, + "flos": 19390290266880.0, + "grad_norm": 2.0461931172516077, + "language_loss": 0.78073311, + "learning_rate": 3.867449165518687e-06, + "loss": 0.80325347, + "num_input_tokens_seen": 25316760, + "step": 1190, + "time_per_iteration": 2.7624130249023438 + }, + { + "auxiliary_loss_clip": 0.01194134, + "auxiliary_loss_mlp": 0.00875034, + "balance_loss_clip": 1.05121458, + "balance_loss_mlp": 1.00022435, + "epoch": 0.14320928275115732, + "flos": 17457506317440.0, + "grad_norm": 1.931172562594599, + "language_loss": 0.71067309, + "learning_rate": 3.867170158119444e-06, + "loss": 0.73136485, + "num_input_tokens_seen": 25335760, + "step": 1191, + "time_per_iteration": 2.727844715118408 + }, + { + "auxiliary_loss_clip": 0.01195486, + "auxiliary_loss_mlp": 0.01093019, + "balance_loss_clip": 1.0522089, + "balance_loss_mlp": 1.00928617, + "epoch": 0.14332952564179643, + "flos": 21466070259840.0, + "grad_norm": 2.046838525348445, + "language_loss": 0.75310689, + "learning_rate": 3.866890867471972e-06, + "loss": 0.77599192, + "num_input_tokens_seen": 25354230, + "step": 1192, + "time_per_iteration": 2.7003746032714844 + }, + { + "auxiliary_loss_clip": 0.01179305, + "auxiliary_loss_mlp": 0.01095929, + "balance_loss_clip": 1.05185342, + "balance_loss_mlp": 1.0119102, + "epoch": 0.14344976853243552, + "flos": 16396992241920.0, + "grad_norm": 2.2793970005426574, + "language_loss": 0.90078038, + "learning_rate": 3.86661129361864e-06, + "loss": 0.92353278, + "num_input_tokens_seen": 25368720, + "step": 1193, + "time_per_iteration": 2.734067916870117 + }, + { + "auxiliary_loss_clip": 0.01165622, + "auxiliary_loss_mlp": 0.01092132, + "balance_loss_clip": 1.0433836, + "balance_loss_mlp": 1.00858974, + "epoch": 0.1435700114230746, + "flos": 18916736336640.0, + "grad_norm": 30.98778480925482, + "language_loss": 0.86156601, + "learning_rate": 3.866331436601859e-06, + "loss": 0.88414353, + "num_input_tokens_seen": 25386715, + "step": 1194, + "time_per_iteration": 2.7236063480377197 + }, + { + "auxiliary_loss_clip": 0.01197132, + "auxiliary_loss_mlp": 0.01094327, + "balance_loss_clip": 1.0546155, + "balance_loss_mlp": 1.0109762, + "epoch": 0.1436902543137137, + "flos": 19755394058880.0, + "grad_norm": 2.827905943186121, + "language_loss": 0.73601258, + "learning_rate": 3.866051296464083e-06, + "loss": 0.75892717, + "num_input_tokens_seen": 25405550, + "step": 1195, + "time_per_iteration": 2.7150871753692627 + }, + { + "auxiliary_loss_clip": 0.01194528, + "auxiliary_loss_mlp": 0.00874971, + "balance_loss_clip": 1.05120087, + "balance_loss_mlp": 1.0002507, + "epoch": 0.1438104972043528, + "flos": 14684807669760.0, + "grad_norm": 2.777292777300237, + "language_loss": 0.85054386, + "learning_rate": 3.86577087324781e-06, + "loss": 0.87123883, + "num_input_tokens_seen": 25422040, + "step": 1196, + "time_per_iteration": 2.6113736629486084 + }, + { + "auxiliary_loss_clip": 0.01185657, + "auxiliary_loss_mlp": 0.01093074, + "balance_loss_clip": 1.05365348, + "balance_loss_mlp": 1.00977015, + "epoch": 0.14393074009499188, + "flos": 17092330698240.0, + "grad_norm": 1.921447189091244, + "language_loss": 0.77585578, + "learning_rate": 3.865490166995578e-06, + "loss": 0.79864311, + "num_input_tokens_seen": 25440270, + "step": 1197, + "time_per_iteration": 2.798196315765381 + }, + { + "auxiliary_loss_clip": 0.01185588, + "auxiliary_loss_mlp": 0.01090571, + "balance_loss_clip": 1.05289984, + "balance_loss_mlp": 1.00698149, + "epoch": 0.144050982985631, + "flos": 30476200608000.0, + "grad_norm": 2.3308796447228852, + "language_loss": 0.84249991, + "learning_rate": 3.86520917774997e-06, + "loss": 0.86526144, + "num_input_tokens_seen": 25459705, + "step": 1198, + "time_per_iteration": 2.8246006965637207 + }, + { + "auxiliary_loss_clip": 0.01179933, + "auxiliary_loss_mlp": 0.01091103, + "balance_loss_clip": 1.04998088, + "balance_loss_mlp": 1.00803757, + "epoch": 0.14417122587627007, + "flos": 17858484817920.0, + "grad_norm": 2.1167147527306436, + "language_loss": 0.75312972, + "learning_rate": 3.864927905553614e-06, + "loss": 0.7758401, + "num_input_tokens_seen": 25477615, + "step": 1199, + "time_per_iteration": 2.6592628955841064 + }, + { + "auxiliary_loss_clip": 0.01153247, + "auxiliary_loss_mlp": 0.01090739, + "balance_loss_clip": 1.04635215, + "balance_loss_mlp": 1.00767398, + "epoch": 0.14429146876690915, + "flos": 21613914639360.0, + "grad_norm": 1.8950549533172298, + "language_loss": 0.88617504, + "learning_rate": 3.8646463504491765e-06, + "loss": 0.90861481, + "num_input_tokens_seen": 25497750, + "step": 1200, + "time_per_iteration": 2.8623526096343994 + }, + { + "auxiliary_loss_clip": 0.01186167, + "auxiliary_loss_mlp": 0.01092332, + "balance_loss_clip": 1.0523262, + "balance_loss_mlp": 1.00902843, + "epoch": 0.14441171165754824, + "flos": 23258120722560.0, + "grad_norm": 1.7803048097272103, + "language_loss": 0.83477783, + "learning_rate": 3.8643645124793705e-06, + "loss": 0.8575629, + "num_input_tokens_seen": 25516650, + "step": 1201, + "time_per_iteration": 2.717465877532959 + }, + { + "auxiliary_loss_clip": 0.0118079, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_clip": 1.04840684, + "balance_loss_mlp": 1.00803542, + "epoch": 0.14453195454818735, + "flos": 42854213963520.0, + "grad_norm": 1.6115259911230635, + "language_loss": 0.74660623, + "learning_rate": 3.8640823916869515e-06, + "loss": 0.76932609, + "num_input_tokens_seen": 25540960, + "step": 1202, + "time_per_iteration": 3.007364273071289 + }, + { + "auxiliary_loss_clip": 0.01194532, + "auxiliary_loss_mlp": 0.01096583, + "balance_loss_clip": 1.05172348, + "balance_loss_mlp": 1.01327991, + "epoch": 0.14465219743882643, + "flos": 27235873774080.0, + "grad_norm": 1.5452972503122389, + "language_loss": 0.78460479, + "learning_rate": 3.863799988114714e-06, + "loss": 0.80751598, + "num_input_tokens_seen": 25562990, + "step": 1203, + "time_per_iteration": 2.7248401641845703 + }, + { + "auxiliary_loss_clip": 0.01193341, + "auxiliary_loss_mlp": 0.01093985, + "balance_loss_clip": 1.05063105, + "balance_loss_mlp": 1.01039577, + "epoch": 0.1447724403294655, + "flos": 16690705752960.0, + "grad_norm": 3.1479414938074646, + "language_loss": 0.70843518, + "learning_rate": 3.863517301805502e-06, + "loss": 0.73130834, + "num_input_tokens_seen": 25581380, + "step": 1204, + "time_per_iteration": 2.7003462314605713 + }, + { + "auxiliary_loss_clip": 0.01154659, + "auxiliary_loss_mlp": 0.01094402, + "balance_loss_clip": 1.04475009, + "balance_loss_mlp": 1.01095581, + "epoch": 0.14489268322010462, + "flos": 20073741321600.0, + "grad_norm": 2.5078493486508164, + "language_loss": 0.97163045, + "learning_rate": 3.863234332802196e-06, + "loss": 0.99412107, + "num_input_tokens_seen": 25593585, + "step": 1205, + "time_per_iteration": 2.6995980739593506 + }, + { + "auxiliary_loss_clip": 0.01177062, + "auxiliary_loss_mlp": 0.01094548, + "balance_loss_clip": 1.05103946, + "balance_loss_mlp": 1.01105356, + "epoch": 0.1450129261107437, + "flos": 27125627955840.0, + "grad_norm": 2.69965840768084, + "language_loss": 0.73993236, + "learning_rate": 3.862951081147723e-06, + "loss": 0.76264846, + "num_input_tokens_seen": 25613750, + "step": 1206, + "time_per_iteration": 2.82450008392334 + }, + { + "auxiliary_loss_clip": 0.01186753, + "auxiliary_loss_mlp": 0.01090434, + "balance_loss_clip": 1.05334485, + "balance_loss_mlp": 1.00755978, + "epoch": 0.1451331690013828, + "flos": 25702344472320.0, + "grad_norm": 2.305230595975195, + "language_loss": 0.78197664, + "learning_rate": 3.862667546885053e-06, + "loss": 0.80474848, + "num_input_tokens_seen": 25632300, + "step": 1207, + "time_per_iteration": 2.784825563430786 + }, + { + "auxiliary_loss_clip": 0.0117367, + "auxiliary_loss_mlp": 0.0109227, + "balance_loss_clip": 1.04840147, + "balance_loss_mlp": 1.00901413, + "epoch": 0.14525341189202187, + "flos": 25737393168000.0, + "grad_norm": 2.031050988118391, + "language_loss": 0.73577178, + "learning_rate": 3.8623837300571965e-06, + "loss": 0.7584312, + "num_input_tokens_seen": 25651285, + "step": 1208, + "time_per_iteration": 2.7869040966033936 + }, + { + "auxiliary_loss_clip": 0.01192336, + "auxiliary_loss_mlp": 0.01091893, + "balance_loss_clip": 1.04949665, + "balance_loss_mlp": 1.00858927, + "epoch": 0.14537365478266098, + "flos": 23073898844160.0, + "grad_norm": 1.7485398334194187, + "language_loss": 0.84220701, + "learning_rate": 3.8620996307072085e-06, + "loss": 0.86504936, + "num_input_tokens_seen": 25671990, + "step": 1209, + "time_per_iteration": 2.770051956176758 + }, + { + "auxiliary_loss_clip": 0.01159568, + "auxiliary_loss_mlp": 0.0109189, + "balance_loss_clip": 1.04693174, + "balance_loss_mlp": 1.00858712, + "epoch": 0.14549389767330007, + "flos": 20595021448320.0, + "grad_norm": 2.064572522470998, + "language_loss": 0.64402092, + "learning_rate": 3.861815248878188e-06, + "loss": 0.66653556, + "num_input_tokens_seen": 25689475, + "step": 1210, + "time_per_iteration": 2.760786294937134 + }, + { + "auxiliary_loss_clip": 0.01167681, + "auxiliary_loss_mlp": 0.01093159, + "balance_loss_clip": 1.04698753, + "balance_loss_mlp": 1.00999892, + "epoch": 0.14561414056393915, + "flos": 15121804533120.0, + "grad_norm": 2.2422697850823847, + "language_loss": 0.79562843, + "learning_rate": 3.861530584613274e-06, + "loss": 0.81823683, + "num_input_tokens_seen": 25707475, + "step": 1211, + "time_per_iteration": 3.669621229171753 + }, + { + "auxiliary_loss_clip": 0.01179156, + "auxiliary_loss_mlp": 0.00874957, + "balance_loss_clip": 1.04740071, + "balance_loss_mlp": 1.0002737, + "epoch": 0.14573438345457826, + "flos": 19427493778560.0, + "grad_norm": 2.368686781803279, + "language_loss": 0.82352328, + "learning_rate": 3.86124563795565e-06, + "loss": 0.84406435, + "num_input_tokens_seen": 25726290, + "step": 1212, + "time_per_iteration": 2.7247507572174072 + }, + { + "auxiliary_loss_clip": 0.01193658, + "auxiliary_loss_mlp": 0.01095589, + "balance_loss_clip": 1.05159104, + "balance_loss_mlp": 1.01247621, + "epoch": 0.14585462634521734, + "flos": 24828422572800.0, + "grad_norm": 1.6028712653412953, + "language_loss": 0.70066893, + "learning_rate": 3.860960408948543e-06, + "loss": 0.72356141, + "num_input_tokens_seen": 25748040, + "step": 1213, + "time_per_iteration": 4.64180064201355 + }, + { + "auxiliary_loss_clip": 0.01177219, + "auxiliary_loss_mlp": 0.01092308, + "balance_loss_clip": 1.04731989, + "balance_loss_mlp": 1.00957704, + "epoch": 0.14597486923585642, + "flos": 15448627405440.0, + "grad_norm": 2.388794123535327, + "language_loss": 0.89803231, + "learning_rate": 3.860674897635222e-06, + "loss": 0.92072761, + "num_input_tokens_seen": 25764525, + "step": 1214, + "time_per_iteration": 2.696077585220337 + }, + { + "auxiliary_loss_clip": 0.01179881, + "auxiliary_loss_mlp": 0.01092648, + "balance_loss_clip": 1.04797566, + "balance_loss_mlp": 1.00958335, + "epoch": 0.1460951121264955, + "flos": 16655154266880.0, + "grad_norm": 2.7643421266442405, + "language_loss": 0.83597714, + "learning_rate": 3.860389104058998e-06, + "loss": 0.85870242, + "num_input_tokens_seen": 25782755, + "step": 1215, + "time_per_iteration": 3.6408233642578125 + }, + { + "auxiliary_loss_clip": 0.01169181, + "auxiliary_loss_mlp": 0.01090875, + "balance_loss_clip": 1.04628944, + "balance_loss_mlp": 1.00761902, + "epoch": 0.14621535501713462, + "flos": 24863291700480.0, + "grad_norm": 2.281470615783427, + "language_loss": 0.72504562, + "learning_rate": 3.860103028263227e-06, + "loss": 0.74764615, + "num_input_tokens_seen": 25805860, + "step": 1216, + "time_per_iteration": 2.8220601081848145 + }, + { + "auxiliary_loss_clip": 0.01151283, + "auxiliary_loss_mlp": 0.01092904, + "balance_loss_clip": 1.0445075, + "balance_loss_mlp": 1.00979114, + "epoch": 0.1463355979077737, + "flos": 25228000442880.0, + "grad_norm": 2.319496131292549, + "language_loss": 0.70436251, + "learning_rate": 3.859816670291304e-06, + "loss": 0.72680438, + "num_input_tokens_seen": 25824955, + "step": 1217, + "time_per_iteration": 2.863795518875122 + }, + { + "auxiliary_loss_clip": 0.01132845, + "auxiliary_loss_mlp": 0.01091595, + "balance_loss_clip": 1.03760338, + "balance_loss_mlp": 1.00843489, + "epoch": 0.14645584079841278, + "flos": 22054143726720.0, + "grad_norm": 2.321052016490813, + "language_loss": 0.9013477, + "learning_rate": 3.859530030186672e-06, + "loss": 0.92359209, + "num_input_tokens_seen": 25841965, + "step": 1218, + "time_per_iteration": 2.850325107574463 + }, + { + "auxiliary_loss_clip": 0.01170765, + "auxiliary_loss_mlp": 0.01092482, + "balance_loss_clip": 1.0475719, + "balance_loss_mlp": 1.00903559, + "epoch": 0.1465760836890519, + "flos": 23623870959360.0, + "grad_norm": 6.0280184179416105, + "language_loss": 0.82690811, + "learning_rate": 3.859243107992813e-06, + "loss": 0.84954059, + "num_input_tokens_seen": 25860770, + "step": 1219, + "time_per_iteration": 2.8440661430358887 + }, + { + "auxiliary_loss_clip": 0.01162109, + "auxiliary_loss_mlp": 0.01093216, + "balance_loss_clip": 1.04441071, + "balance_loss_mlp": 1.00967419, + "epoch": 0.14669632657969098, + "flos": 37407893356800.0, + "grad_norm": 3.420081089919889, + "language_loss": 0.78225017, + "learning_rate": 3.858955903753252e-06, + "loss": 0.80480343, + "num_input_tokens_seen": 25879410, + "step": 1220, + "time_per_iteration": 2.937175989151001 + }, + { + "auxiliary_loss_clip": 0.01184054, + "auxiliary_loss_mlp": 0.01093441, + "balance_loss_clip": 1.05070126, + "balance_loss_mlp": 1.01023245, + "epoch": 0.14681656947033006, + "flos": 28365910623360.0, + "grad_norm": 1.7691809276907444, + "language_loss": 0.83527499, + "learning_rate": 3.858668417511559e-06, + "loss": 0.85804999, + "num_input_tokens_seen": 25902160, + "step": 1221, + "time_per_iteration": 2.850206136703491 + }, + { + "auxiliary_loss_clip": 0.01172644, + "auxiliary_loss_mlp": 0.0109138, + "balance_loss_clip": 1.04853272, + "balance_loss_mlp": 1.00817204, + "epoch": 0.14693681236096917, + "flos": 18479488078080.0, + "grad_norm": 2.125448152502013, + "language_loss": 0.76185739, + "learning_rate": 3.8583806493113445e-06, + "loss": 0.78449762, + "num_input_tokens_seen": 25920505, + "step": 1222, + "time_per_iteration": 2.7671420574188232 + }, + { + "auxiliary_loss_clip": 0.01174381, + "auxiliary_loss_mlp": 0.01090928, + "balance_loss_clip": 1.04456162, + "balance_loss_mlp": 1.00781488, + "epoch": 0.14705705525160825, + "flos": 20777806782720.0, + "grad_norm": 1.9867445968520947, + "language_loss": 0.82057488, + "learning_rate": 3.858092599196263e-06, + "loss": 0.84322798, + "num_input_tokens_seen": 25938460, + "step": 1223, + "time_per_iteration": 2.689441442489624 + }, + { + "auxiliary_loss_clip": 0.01181059, + "auxiliary_loss_mlp": 0.01091749, + "balance_loss_clip": 1.04930735, + "balance_loss_mlp": 1.00849342, + "epoch": 0.14717729814224734, + "flos": 29932944336000.0, + "grad_norm": 2.201741080794397, + "language_loss": 0.82163262, + "learning_rate": 3.857804267210012e-06, + "loss": 0.84436071, + "num_input_tokens_seen": 25957760, + "step": 1224, + "time_per_iteration": 2.8531157970428467 + }, + { + "auxiliary_loss_clip": 0.01153462, + "auxiliary_loss_mlp": 0.0109226, + "balance_loss_clip": 1.04539526, + "balance_loss_mlp": 1.00929046, + "epoch": 0.14729754103288642, + "flos": 20047491457920.0, + "grad_norm": 1.9629131260142376, + "language_loss": 0.88192022, + "learning_rate": 3.857515653396331e-06, + "loss": 0.90437746, + "num_input_tokens_seen": 25974970, + "step": 1225, + "time_per_iteration": 2.7595643997192383 + }, + { + "auxiliary_loss_clip": 0.0114859, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_clip": 1.04394794, + "balance_loss_mlp": 1.00661635, + "epoch": 0.14741778392352553, + "flos": 19281516906240.0, + "grad_norm": 2.63068017020303, + "language_loss": 0.86558998, + "learning_rate": 3.857226757799002e-06, + "loss": 0.88797367, + "num_input_tokens_seen": 25992525, + "step": 1226, + "time_per_iteration": 2.8233389854431152 + }, + { + "auxiliary_loss_clip": 0.01167649, + "auxiliary_loss_mlp": 0.01094618, + "balance_loss_clip": 1.04649425, + "balance_loss_mlp": 1.01126659, + "epoch": 0.1475380268141646, + "flos": 25411108999680.0, + "grad_norm": 2.2161008178897283, + "language_loss": 0.7435472, + "learning_rate": 3.85693758046185e-06, + "loss": 0.76616979, + "num_input_tokens_seen": 26010815, + "step": 1227, + "time_per_iteration": 2.7715580463409424 + }, + { + "auxiliary_loss_clip": 0.01194833, + "auxiliary_loss_mlp": 0.01091936, + "balance_loss_clip": 1.05267632, + "balance_loss_mlp": 1.00891876, + "epoch": 0.1476582697048037, + "flos": 20847652778880.0, + "grad_norm": 2.1618027341505286, + "language_loss": 0.82797062, + "learning_rate": 3.8566481214287435e-06, + "loss": 0.8508383, + "num_input_tokens_seen": 26028935, + "step": 1228, + "time_per_iteration": 2.7094180583953857 + }, + { + "auxiliary_loss_clip": 0.01160186, + "auxiliary_loss_mlp": 0.01092839, + "balance_loss_clip": 1.04708195, + "balance_loss_mlp": 1.00977397, + "epoch": 0.1477785125954428, + "flos": 14028109269120.0, + "grad_norm": 1.932451676383176, + "language_loss": 0.90654254, + "learning_rate": 3.8563583807435935e-06, + "loss": 0.92907286, + "num_input_tokens_seen": 26045080, + "step": 1229, + "time_per_iteration": 2.7358343601226807 + }, + { + "auxiliary_loss_clip": 0.01181646, + "auxiliary_loss_mlp": 0.00874957, + "balance_loss_clip": 1.04931498, + "balance_loss_mlp": 1.00025868, + "epoch": 0.1478987554860819, + "flos": 20516699842560.0, + "grad_norm": 1.8957392501080665, + "language_loss": 0.77507174, + "learning_rate": 3.856068358450353e-06, + "loss": 0.79563779, + "num_input_tokens_seen": 26065030, + "step": 1230, + "time_per_iteration": 2.7102789878845215 + }, + { + "auxiliary_loss_clip": 0.01162214, + "auxiliary_loss_mlp": 0.01093309, + "balance_loss_clip": 1.04263294, + "balance_loss_mlp": 1.01000595, + "epoch": 0.14801899837672097, + "flos": 17857012360320.0, + "grad_norm": 1.6459144819953098, + "language_loss": 0.85628235, + "learning_rate": 3.8557780545930186e-06, + "loss": 0.87883759, + "num_input_tokens_seen": 26083445, + "step": 1231, + "time_per_iteration": 2.740021228790283 + }, + { + "auxiliary_loss_clip": 0.01166588, + "auxiliary_loss_mlp": 0.01091453, + "balance_loss_clip": 1.04510963, + "balance_loss_mlp": 1.00848341, + "epoch": 0.14813924126736006, + "flos": 20881408584960.0, + "grad_norm": 1.660330056313795, + "language_loss": 0.79407978, + "learning_rate": 3.855487469215628e-06, + "loss": 0.81666023, + "num_input_tokens_seen": 26102375, + "step": 1232, + "time_per_iteration": 2.7333595752716064 + }, + { + "auxiliary_loss_clip": 0.01159758, + "auxiliary_loss_mlp": 0.01092188, + "balance_loss_clip": 1.04692841, + "balance_loss_mlp": 1.00921869, + "epoch": 0.14825948415799917, + "flos": 37414070496000.0, + "grad_norm": 2.5138353161314972, + "language_loss": 0.72161072, + "learning_rate": 3.855196602362264e-06, + "loss": 0.74413025, + "num_input_tokens_seen": 26125295, + "step": 1233, + "time_per_iteration": 2.9508602619171143 + }, + { + "auxiliary_loss_clip": 0.01181298, + "auxiliary_loss_mlp": 0.01093791, + "balance_loss_clip": 1.04868877, + "balance_loss_mlp": 1.01053512, + "epoch": 0.14837972704863825, + "flos": 22014641744640.0, + "grad_norm": 2.095557773240989, + "language_loss": 0.94509673, + "learning_rate": 3.854905454077051e-06, + "loss": 0.96784765, + "num_input_tokens_seen": 26142905, + "step": 1234, + "time_per_iteration": 2.672490119934082 + }, + { + "auxiliary_loss_clip": 0.01125625, + "auxiliary_loss_mlp": 0.01091895, + "balance_loss_clip": 1.03862917, + "balance_loss_mlp": 1.00882983, + "epoch": 0.14849996993927733, + "flos": 20996323171200.0, + "grad_norm": 2.212787229085263, + "language_loss": 0.87958354, + "learning_rate": 3.854614024404155e-06, + "loss": 0.90175879, + "num_input_tokens_seen": 26161215, + "step": 1235, + "time_per_iteration": 3.004387855529785 + }, + { + "auxiliary_loss_clip": 0.01166776, + "auxiliary_loss_mlp": 0.01092912, + "balance_loss_clip": 1.04811549, + "balance_loss_mlp": 1.00970411, + "epoch": 0.14862021282991644, + "flos": 20047994248320.0, + "grad_norm": 1.8197906975578777, + "language_loss": 0.89323366, + "learning_rate": 3.8543223133877865e-06, + "loss": 0.91583049, + "num_input_tokens_seen": 26179810, + "step": 1236, + "time_per_iteration": 2.795999765396118 + }, + { + "auxiliary_loss_clip": 0.011661, + "auxiliary_loss_mlp": 0.01093605, + "balance_loss_clip": 1.04760873, + "balance_loss_mlp": 1.01020622, + "epoch": 0.14874045572055553, + "flos": 22712027276160.0, + "grad_norm": 1.7486302371261424, + "language_loss": 0.88295984, + "learning_rate": 3.854030321072198e-06, + "loss": 0.90555686, + "num_input_tokens_seen": 26199715, + "step": 1237, + "time_per_iteration": 4.653550386428833 + }, + { + "auxiliary_loss_clip": 0.01163165, + "auxiliary_loss_mlp": 0.01092781, + "balance_loss_clip": 1.04652297, + "balance_loss_mlp": 1.0096209, + "epoch": 0.1488606986111946, + "flos": 25411288567680.0, + "grad_norm": 1.8735081353823027, + "language_loss": 0.73340636, + "learning_rate": 3.853738047501682e-06, + "loss": 0.75596583, + "num_input_tokens_seen": 26220275, + "step": 1238, + "time_per_iteration": 2.8875203132629395 + }, + { + "auxiliary_loss_clip": 0.01180634, + "auxiliary_loss_mlp": 0.01094286, + "balance_loss_clip": 1.0490613, + "balance_loss_mlp": 1.01098251, + "epoch": 0.1489809415018337, + "flos": 17018749687680.0, + "grad_norm": 1.7603365600523824, + "language_loss": 0.77633417, + "learning_rate": 3.85344549272058e-06, + "loss": 0.79908335, + "num_input_tokens_seen": 26238255, + "step": 1239, + "time_per_iteration": 3.7802717685699463 + }, + { + "auxiliary_loss_clip": 0.01183536, + "auxiliary_loss_mlp": 0.01093639, + "balance_loss_clip": 1.049927, + "balance_loss_mlp": 1.01033556, + "epoch": 0.1491011843924728, + "flos": 33659394860160.0, + "grad_norm": 1.777234158254082, + "language_loss": 0.82591307, + "learning_rate": 3.853152656773269e-06, + "loss": 0.84868479, + "num_input_tokens_seen": 26259690, + "step": 1240, + "time_per_iteration": 2.89993953704834 + }, + { + "auxiliary_loss_clip": 0.01167049, + "auxiliary_loss_mlp": 0.01091113, + "balance_loss_clip": 1.04572797, + "balance_loss_mlp": 1.00804794, + "epoch": 0.14922142728311188, + "flos": 21179000764800.0, + "grad_norm": 1.6400260520655074, + "language_loss": 0.85142553, + "learning_rate": 3.852859539704174e-06, + "loss": 0.87400723, + "num_input_tokens_seen": 26278990, + "step": 1241, + "time_per_iteration": 3.7169580459594727 + }, + { + "auxiliary_loss_clip": 0.01138157, + "auxiliary_loss_mlp": 0.01094259, + "balance_loss_clip": 1.03899479, + "balance_loss_mlp": 1.01086009, + "epoch": 0.14934167017375097, + "flos": 29860548474240.0, + "grad_norm": 6.460336695751088, + "language_loss": 0.76478982, + "learning_rate": 3.85256614155776e-06, + "loss": 0.78711396, + "num_input_tokens_seen": 26299120, + "step": 1242, + "time_per_iteration": 2.917480945587158 + }, + { + "auxiliary_loss_clip": 0.01175971, + "auxiliary_loss_mlp": 0.01090951, + "balance_loss_clip": 1.04679513, + "balance_loss_mlp": 1.00783837, + "epoch": 0.14946191306439008, + "flos": 17019216564480.0, + "grad_norm": 3.1341239177701294, + "language_loss": 0.74163449, + "learning_rate": 3.852272462378535e-06, + "loss": 0.76430368, + "num_input_tokens_seen": 26316995, + "step": 1243, + "time_per_iteration": 2.689990520477295 + }, + { + "auxiliary_loss_clip": 0.01166697, + "auxiliary_loss_mlp": 0.01089653, + "balance_loss_clip": 1.04433703, + "balance_loss_mlp": 1.00644517, + "epoch": 0.14958215595502916, + "flos": 15669047214720.0, + "grad_norm": 2.389657153158627, + "language_loss": 0.7790029, + "learning_rate": 3.85197850221105e-06, + "loss": 0.80156648, + "num_input_tokens_seen": 26333295, + "step": 1244, + "time_per_iteration": 2.8094940185546875 + }, + { + "auxiliary_loss_clip": 0.01179833, + "auxiliary_loss_mlp": 0.01091725, + "balance_loss_clip": 1.04840386, + "balance_loss_mlp": 1.0088985, + "epoch": 0.14970239884566824, + "flos": 33108560818560.0, + "grad_norm": 1.7543475329780853, + "language_loss": 0.75911933, + "learning_rate": 3.851684261099899e-06, + "loss": 0.7818349, + "num_input_tokens_seen": 26355035, + "step": 1245, + "time_per_iteration": 2.8360514640808105 + }, + { + "auxiliary_loss_clip": 0.01174042, + "auxiliary_loss_mlp": 0.01092233, + "balance_loss_clip": 1.04817057, + "balance_loss_mlp": 1.00878668, + "epoch": 0.14982264173630733, + "flos": 17821245392640.0, + "grad_norm": 2.429601155887999, + "language_loss": 0.86618352, + "learning_rate": 3.851389739089718e-06, + "loss": 0.88884622, + "num_input_tokens_seen": 26371655, + "step": 1246, + "time_per_iteration": 2.7621567249298096 + }, + { + "auxiliary_loss_clip": 0.01180333, + "auxiliary_loss_mlp": 0.01094157, + "balance_loss_clip": 1.04966366, + "balance_loss_mlp": 1.01080561, + "epoch": 0.14994288462694644, + "flos": 32409559175040.0, + "grad_norm": 1.849251345607677, + "language_loss": 0.80498946, + "learning_rate": 3.851094936225186e-06, + "loss": 0.82773435, + "num_input_tokens_seen": 26392540, + "step": 1247, + "time_per_iteration": 2.7898507118225098 + }, + { + "auxiliary_loss_clip": 0.01162904, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_clip": 1.04327238, + "balance_loss_mlp": 1.00930285, + "epoch": 0.15006312751758552, + "flos": 31794661226880.0, + "grad_norm": 2.602296216119858, + "language_loss": 0.76553869, + "learning_rate": 3.850799852551024e-06, + "loss": 0.78809136, + "num_input_tokens_seen": 26414960, + "step": 1248, + "time_per_iteration": 2.893909454345703 + }, + { + "auxiliary_loss_clip": 0.0118397, + "auxiliary_loss_mlp": 0.01091669, + "balance_loss_clip": 1.0510782, + "balance_loss_mlp": 1.00855672, + "epoch": 0.1501833704082246, + "flos": 16618022582400.0, + "grad_norm": 2.360907160002483, + "language_loss": 0.86264575, + "learning_rate": 3.850504488111995e-06, + "loss": 0.88540208, + "num_input_tokens_seen": 26431635, + "step": 1249, + "time_per_iteration": 2.782651662826538 + }, + { + "auxiliary_loss_clip": 0.01158699, + "auxiliary_loss_mlp": 0.01090837, + "balance_loss_clip": 1.04524803, + "balance_loss_mlp": 1.00810623, + "epoch": 0.15030361329886371, + "flos": 23471178243840.0, + "grad_norm": 2.0428376739442484, + "language_loss": 0.82439721, + "learning_rate": 3.850208842952907e-06, + "loss": 0.84689254, + "num_input_tokens_seen": 26450440, + "step": 1250, + "time_per_iteration": 2.814279079437256 + }, + { + "auxiliary_loss_clip": 0.01144323, + "auxiliary_loss_mlp": 0.01091734, + "balance_loss_clip": 1.03919113, + "balance_loss_mlp": 1.00843084, + "epoch": 0.1504238561895028, + "flos": 25629409906560.0, + "grad_norm": 1.6802247165890918, + "language_loss": 0.79251885, + "learning_rate": 3.849912917118608e-06, + "loss": 0.81487942, + "num_input_tokens_seen": 26471480, + "step": 1251, + "time_per_iteration": 3.0507590770721436 + }, + { + "auxiliary_loss_clip": 0.01198051, + "auxiliary_loss_mlp": 0.01080391, + "balance_loss_clip": 1.07693803, + "balance_loss_mlp": 0.99990141, + "epoch": 0.15054409908014188, + "flos": 52095146129280.0, + "grad_norm": 0.8776225766674179, + "language_loss": 0.59237409, + "learning_rate": 3.849616710653992e-06, + "loss": 0.61515856, + "num_input_tokens_seen": 26532950, + "step": 1252, + "time_per_iteration": 3.2814857959747314 + }, + { + "auxiliary_loss_clip": 0.01177761, + "auxiliary_loss_mlp": 0.01090915, + "balance_loss_clip": 1.0464766, + "balance_loss_mlp": 1.00789762, + "epoch": 0.150664341970781, + "flos": 18880251096960.0, + "grad_norm": 1.6004021489925973, + "language_loss": 0.7534014, + "learning_rate": 3.84932022360399e-06, + "loss": 0.77608812, + "num_input_tokens_seen": 26551615, + "step": 1253, + "time_per_iteration": 2.7923572063446045 + }, + { + "auxiliary_loss_clip": 0.01168138, + "auxiliary_loss_mlp": 0.01091184, + "balance_loss_clip": 1.04772031, + "balance_loss_mlp": 1.00821424, + "epoch": 0.15078458486142007, + "flos": 22163240309760.0, + "grad_norm": 2.97761974421163, + "language_loss": 0.84660202, + "learning_rate": 3.849023456013581e-06, + "loss": 0.86919528, + "num_input_tokens_seen": 26569175, + "step": 1254, + "time_per_iteration": 2.8500592708587646 + }, + { + "auxiliary_loss_clip": 0.01180455, + "auxiliary_loss_mlp": 0.01091618, + "balance_loss_clip": 1.04757798, + "balance_loss_mlp": 1.00850534, + "epoch": 0.15090482775205916, + "flos": 26651894457600.0, + "grad_norm": 2.212409451905642, + "language_loss": 0.62347162, + "learning_rate": 3.848726407927784e-06, + "loss": 0.64619243, + "num_input_tokens_seen": 26589560, + "step": 1255, + "time_per_iteration": 2.9227375984191895 + }, + { + "auxiliary_loss_clip": 0.01167468, + "auxiliary_loss_mlp": 0.01090196, + "balance_loss_clip": 1.04571819, + "balance_loss_mlp": 1.00717878, + "epoch": 0.15102507064269824, + "flos": 21798998444160.0, + "grad_norm": 3.7647031605666093, + "language_loss": 0.86472946, + "learning_rate": 3.84842907939166e-06, + "loss": 0.88730609, + "num_input_tokens_seen": 26608785, + "step": 1256, + "time_per_iteration": 2.8635406494140625 + }, + { + "auxiliary_loss_clip": 0.01158686, + "auxiliary_loss_mlp": 0.01091015, + "balance_loss_clip": 1.04498112, + "balance_loss_mlp": 1.00823593, + "epoch": 0.15114531353333735, + "flos": 22820908377600.0, + "grad_norm": 2.976129407804033, + "language_loss": 0.71213222, + "learning_rate": 3.8481314704503146e-06, + "loss": 0.73462927, + "num_input_tokens_seen": 26628615, + "step": 1257, + "time_per_iteration": 2.8162920475006104 + }, + { + "auxiliary_loss_clip": 0.01182749, + "auxiliary_loss_mlp": 0.01092158, + "balance_loss_clip": 1.05143118, + "balance_loss_mlp": 1.00923574, + "epoch": 0.15126555642397643, + "flos": 19682674974720.0, + "grad_norm": 2.2918877432952134, + "language_loss": 0.88207901, + "learning_rate": 3.847833581148895e-06, + "loss": 0.90482807, + "num_input_tokens_seen": 26647525, + "step": 1258, + "time_per_iteration": 2.7603275775909424 + }, + { + "auxiliary_loss_clip": 0.01186783, + "auxiliary_loss_mlp": 0.01090633, + "balance_loss_clip": 1.04615021, + "balance_loss_mlp": 1.00742531, + "epoch": 0.15138579931461552, + "flos": 28726022424960.0, + "grad_norm": 1.909420374658142, + "language_loss": 0.81429946, + "learning_rate": 3.84753541153259e-06, + "loss": 0.83707362, + "num_input_tokens_seen": 26667095, + "step": 1259, + "time_per_iteration": 2.6915297508239746 + }, + { + "auxiliary_loss_clip": 0.01181274, + "auxiliary_loss_mlp": 0.01090964, + "balance_loss_clip": 1.04957235, + "balance_loss_mlp": 1.00789905, + "epoch": 0.15150604220525463, + "flos": 22127006465280.0, + "grad_norm": 1.5404371998693405, + "language_loss": 0.83184558, + "learning_rate": 3.847236961646633e-06, + "loss": 0.854568, + "num_input_tokens_seen": 26686075, + "step": 1260, + "time_per_iteration": 2.773358106613159 + }, + { + "auxiliary_loss_clip": 0.01165282, + "auxiliary_loss_mlp": 0.01092104, + "balance_loss_clip": 1.04767442, + "balance_loss_mlp": 1.00913405, + "epoch": 0.1516262850958937, + "flos": 12968708515200.0, + "grad_norm": 2.9129518721770746, + "language_loss": 0.78406131, + "learning_rate": 3.846938231536296e-06, + "loss": 0.80663514, + "num_input_tokens_seen": 26701695, + "step": 1261, + "time_per_iteration": 2.745393991470337 + }, + { + "auxiliary_loss_clip": 0.01182106, + "auxiliary_loss_mlp": 0.0109117, + "balance_loss_clip": 1.05028605, + "balance_loss_mlp": 1.00853372, + "epoch": 0.1517465279865328, + "flos": 21797130936960.0, + "grad_norm": 3.3639134562399837, + "language_loss": 0.81028843, + "learning_rate": 3.8466392212468995e-06, + "loss": 0.83302122, + "num_input_tokens_seen": 26721885, + "step": 1262, + "time_per_iteration": 2.7303218841552734 + }, + { + "auxiliary_loss_clip": 0.0118341, + "auxiliary_loss_mlp": 0.01080738, + "balance_loss_clip": 1.07828736, + "balance_loss_mlp": 1.00024831, + "epoch": 0.15186677087717187, + "flos": 58174569901440.0, + "grad_norm": 0.8183872352252163, + "language_loss": 0.61949253, + "learning_rate": 3.8463399308238e-06, + "loss": 0.64213407, + "num_input_tokens_seen": 26780990, + "step": 1263, + "time_per_iteration": 5.126190185546875 + }, + { + "auxiliary_loss_clip": 0.01180106, + "auxiliary_loss_mlp": 0.01091471, + "balance_loss_clip": 1.04858398, + "balance_loss_mlp": 1.00854874, + "epoch": 0.15198701376781099, + "flos": 32669696448000.0, + "grad_norm": 1.5894975340241007, + "language_loss": 0.6415047, + "learning_rate": 3.846040360312402e-06, + "loss": 0.66422051, + "num_input_tokens_seen": 26804250, + "step": 1264, + "time_per_iteration": 3.873121976852417 + }, + { + "auxiliary_loss_clip": 0.01187946, + "auxiliary_loss_mlp": 0.01089712, + "balance_loss_clip": 1.04747629, + "balance_loss_mlp": 1.00674272, + "epoch": 0.15210725665845007, + "flos": 28402575431040.0, + "grad_norm": 2.0469112384535313, + "language_loss": 0.81126761, + "learning_rate": 3.8457405097581485e-06, + "loss": 0.83404422, + "num_input_tokens_seen": 26823240, + "step": 1265, + "time_per_iteration": 2.815894365310669 + }, + { + "auxiliary_loss_clip": 0.01146867, + "auxiliary_loss_mlp": 0.01091637, + "balance_loss_clip": 1.04357886, + "balance_loss_mlp": 1.00847626, + "epoch": 0.15222749954908915, + "flos": 19938179393280.0, + "grad_norm": 1.8335879601571203, + "language_loss": 0.78153229, + "learning_rate": 3.8454403792065275e-06, + "loss": 0.80391729, + "num_input_tokens_seen": 26842060, + "step": 1266, + "time_per_iteration": 3.791701078414917 + }, + { + "auxiliary_loss_clip": 0.01152296, + "auxiliary_loss_mlp": 0.0109173, + "balance_loss_clip": 1.04451251, + "balance_loss_mlp": 1.00885546, + "epoch": 0.15234774243972826, + "flos": 21324223451520.0, + "grad_norm": 2.044448188431349, + "language_loss": 0.85740268, + "learning_rate": 3.845139968703068e-06, + "loss": 0.879843, + "num_input_tokens_seen": 26859580, + "step": 1267, + "time_per_iteration": 2.8074424266815186 + }, + { + "auxiliary_loss_clip": 0.01141231, + "auxiliary_loss_mlp": 0.01092393, + "balance_loss_clip": 1.04141903, + "balance_loss_mlp": 1.00928044, + "epoch": 0.15246798533036734, + "flos": 25957812977280.0, + "grad_norm": 1.671436827441162, + "language_loss": 0.83171988, + "learning_rate": 3.844839278293342e-06, + "loss": 0.85405612, + "num_input_tokens_seen": 26880430, + "step": 1268, + "time_per_iteration": 2.8536460399627686 + }, + { + "auxiliary_loss_clip": 0.01189471, + "auxiliary_loss_mlp": 0.01090605, + "balance_loss_clip": 1.04959464, + "balance_loss_mlp": 1.00763559, + "epoch": 0.15258822822100643, + "flos": 25811907932160.0, + "grad_norm": 5.926249745163282, + "language_loss": 0.76649892, + "learning_rate": 3.8445383080229654e-06, + "loss": 0.78929973, + "num_input_tokens_seen": 26896445, + "step": 1269, + "time_per_iteration": 2.716844320297241 + }, + { + "auxiliary_loss_clip": 0.01169753, + "auxiliary_loss_mlp": 0.01093629, + "balance_loss_clip": 1.046808, + "balance_loss_mlp": 1.01061177, + "epoch": 0.1527084711116455, + "flos": 25265455349760.0, + "grad_norm": 1.9921856621838439, + "language_loss": 0.74019587, + "learning_rate": 3.844237057937593e-06, + "loss": 0.76282978, + "num_input_tokens_seen": 26915450, + "step": 1270, + "time_per_iteration": 2.7653005123138428 + }, + { + "auxiliary_loss_clip": 0.01177135, + "auxiliary_loss_mlp": 0.01091763, + "balance_loss_clip": 1.04517102, + "balance_loss_mlp": 1.00874567, + "epoch": 0.15282871400228462, + "flos": 29240227572480.0, + "grad_norm": 2.3586527129626575, + "language_loss": 0.78138083, + "learning_rate": 3.843935528082926e-06, + "loss": 0.80406982, + "num_input_tokens_seen": 26936475, + "step": 1271, + "time_per_iteration": 2.8204751014709473 + }, + { + "auxiliary_loss_clip": 0.01178465, + "auxiliary_loss_mlp": 0.01091833, + "balance_loss_clip": 1.04708028, + "balance_loss_mlp": 1.00891113, + "epoch": 0.1529489568929237, + "flos": 20882952869760.0, + "grad_norm": 1.8555266614271955, + "language_loss": 0.85297108, + "learning_rate": 3.843633718504704e-06, + "loss": 0.87567401, + "num_input_tokens_seen": 26954920, + "step": 1272, + "time_per_iteration": 2.7149813175201416 + }, + { + "auxiliary_loss_clip": 0.01150767, + "auxiliary_loss_mlp": 0.01091437, + "balance_loss_clip": 1.04021871, + "balance_loss_mlp": 1.00856304, + "epoch": 0.1530691997835628, + "flos": 20083833043200.0, + "grad_norm": 2.852770482597366, + "language_loss": 0.90205091, + "learning_rate": 3.843331629248715e-06, + "loss": 0.92447293, + "num_input_tokens_seen": 26972520, + "step": 1273, + "time_per_iteration": 2.7974400520324707 + }, + { + "auxiliary_loss_clip": 0.01188813, + "auxiliary_loss_mlp": 0.01095141, + "balance_loss_clip": 1.04881907, + "balance_loss_mlp": 1.01212323, + "epoch": 0.1531894426742019, + "flos": 28759814144640.0, + "grad_norm": 2.5677891708921057, + "language_loss": 0.76783264, + "learning_rate": 3.843029260360782e-06, + "loss": 0.79067218, + "num_input_tokens_seen": 26990890, + "step": 1274, + "time_per_iteration": 2.7704265117645264 + }, + { + "auxiliary_loss_clip": 0.01177466, + "auxiliary_loss_mlp": 0.01090858, + "balance_loss_clip": 1.04681408, + "balance_loss_mlp": 1.00807953, + "epoch": 0.15330968556484098, + "flos": 22236282616320.0, + "grad_norm": 1.8605245124967908, + "language_loss": 0.78962457, + "learning_rate": 3.8427266118867755e-06, + "loss": 0.81230783, + "num_input_tokens_seen": 27010640, + "step": 1275, + "time_per_iteration": 2.804121971130371 + }, + { + "auxiliary_loss_clip": 0.01163228, + "auxiliary_loss_mlp": 0.01089624, + "balance_loss_clip": 1.04340219, + "balance_loss_mlp": 1.00670207, + "epoch": 0.15342992845548006, + "flos": 27527504296320.0, + "grad_norm": 1.9335878328047684, + "language_loss": 0.82944173, + "learning_rate": 3.842423683872608e-06, + "loss": 0.85197026, + "num_input_tokens_seen": 27031215, + "step": 1276, + "time_per_iteration": 2.836653709411621 + }, + { + "auxiliary_loss_clip": 0.01177141, + "auxiliary_loss_mlp": 0.01092238, + "balance_loss_clip": 1.04889309, + "balance_loss_mlp": 1.00922036, + "epoch": 0.15355017134611917, + "flos": 19609596754560.0, + "grad_norm": 2.248879595237247, + "language_loss": 0.77892119, + "learning_rate": 3.842120476364232e-06, + "loss": 0.801615, + "num_input_tokens_seen": 27049665, + "step": 1277, + "time_per_iteration": 2.7219762802124023 + }, + { + "auxiliary_loss_clip": 0.01180013, + "auxiliary_loss_mlp": 0.01093324, + "balance_loss_clip": 1.04823625, + "balance_loss_mlp": 1.01016355, + "epoch": 0.15367041423675826, + "flos": 18478590238080.0, + "grad_norm": 1.8778286226069507, + "language_loss": 0.83698702, + "learning_rate": 3.841816989407644e-06, + "loss": 0.85972047, + "num_input_tokens_seen": 27065155, + "step": 1278, + "time_per_iteration": 2.794553279876709 + }, + { + "auxiliary_loss_clip": 0.01151243, + "auxiliary_loss_mlp": 0.01093046, + "balance_loss_clip": 1.04199004, + "balance_loss_mlp": 1.01021981, + "epoch": 0.15379065712739734, + "flos": 41427662342400.0, + "grad_norm": 1.9430972903834294, + "language_loss": 0.76916635, + "learning_rate": 3.841513223048884e-06, + "loss": 0.79160923, + "num_input_tokens_seen": 27085840, + "step": 1279, + "time_per_iteration": 2.9656598567962646 + }, + { + "auxiliary_loss_clip": 0.01148946, + "auxiliary_loss_mlp": 0.01092111, + "balance_loss_clip": 1.04315734, + "balance_loss_mlp": 1.00885487, + "epoch": 0.15391090001803642, + "flos": 22054215553920.0, + "grad_norm": 2.16418276869218, + "language_loss": 0.78742504, + "learning_rate": 3.841209177334031e-06, + "loss": 0.80983555, + "num_input_tokens_seen": 27104200, + "step": 1280, + "time_per_iteration": 2.8373730182647705 + }, + { + "auxiliary_loss_clip": 0.01174425, + "auxiliary_loss_mlp": 0.01090377, + "balance_loss_clip": 1.04485023, + "balance_loss_mlp": 1.00755024, + "epoch": 0.15403114290867553, + "flos": 15450351258240.0, + "grad_norm": 1.7601114613588327, + "language_loss": 0.74783069, + "learning_rate": 3.84090485230921e-06, + "loss": 0.77047873, + "num_input_tokens_seen": 27122440, + "step": 1281, + "time_per_iteration": 2.728334665298462 + }, + { + "auxiliary_loss_clip": 0.01186965, + "auxiliary_loss_mlp": 0.01089954, + "balance_loss_clip": 1.04784942, + "balance_loss_mlp": 1.00712705, + "epoch": 0.15415138579931462, + "flos": 17929156826880.0, + "grad_norm": 2.5314166229825057, + "language_loss": 0.76741827, + "learning_rate": 3.840600248020588e-06, + "loss": 0.79018748, + "num_input_tokens_seen": 27139380, + "step": 1282, + "time_per_iteration": 2.645571231842041 + }, + { + "auxiliary_loss_clip": 0.01171023, + "auxiliary_loss_mlp": 0.01092591, + "balance_loss_clip": 1.04684031, + "balance_loss_mlp": 1.00943089, + "epoch": 0.1542716286899537, + "flos": 11429325296640.0, + "grad_norm": 1.9979755717325602, + "language_loss": 0.79952288, + "learning_rate": 3.840295364514371e-06, + "loss": 0.82215905, + "num_input_tokens_seen": 27156760, + "step": 1283, + "time_per_iteration": 2.7438199520111084 + }, + { + "auxiliary_loss_clip": 0.01167213, + "auxiliary_loss_mlp": 0.01091861, + "balance_loss_clip": 1.04641521, + "balance_loss_mlp": 1.00903475, + "epoch": 0.1543918715805928, + "flos": 17420338719360.0, + "grad_norm": 2.4780859846944097, + "language_loss": 0.7877394, + "learning_rate": 3.83999020183681e-06, + "loss": 0.81033015, + "num_input_tokens_seen": 27175455, + "step": 1284, + "time_per_iteration": 2.7557928562164307 + }, + { + "auxiliary_loss_clip": 0.01124827, + "auxiliary_loss_mlp": 0.01093143, + "balance_loss_clip": 1.03756917, + "balance_loss_mlp": 1.00983989, + "epoch": 0.1545121144712319, + "flos": 17786376264960.0, + "grad_norm": 1.8037347328014428, + "language_loss": 0.78760254, + "learning_rate": 3.839684760034199e-06, + "loss": 0.80978215, + "num_input_tokens_seen": 27193660, + "step": 1285, + "time_per_iteration": 2.8973727226257324 + }, + { + "auxiliary_loss_clip": 0.01158095, + "auxiliary_loss_mlp": 0.01091497, + "balance_loss_clip": 1.04535449, + "balance_loss_mlp": 1.0085746, + "epoch": 0.15463235736187098, + "flos": 28220185146240.0, + "grad_norm": 2.087013289361297, + "language_loss": 0.65187842, + "learning_rate": 3.8393790391528716e-06, + "loss": 0.67437434, + "num_input_tokens_seen": 27214355, + "step": 1286, + "time_per_iteration": 2.8289341926574707 + }, + { + "auxiliary_loss_clip": 0.01162789, + "auxiliary_loss_mlp": 0.01092104, + "balance_loss_clip": 1.0423913, + "balance_loss_mlp": 1.0092299, + "epoch": 0.15475260025251006, + "flos": 22856890826880.0, + "grad_norm": 1.8575262895882236, + "language_loss": 0.89251399, + "learning_rate": 3.8390730392392075e-06, + "loss": 0.9150629, + "num_input_tokens_seen": 27234335, + "step": 1287, + "time_per_iteration": 3.6394383907318115 + }, + { + "auxiliary_loss_clip": 0.01187001, + "auxiliary_loss_mlp": 0.01091742, + "balance_loss_clip": 1.04754353, + "balance_loss_mlp": 1.00877202, + "epoch": 0.15487284314314917, + "flos": 17602872658560.0, + "grad_norm": 2.2002462908706533, + "language_loss": 0.79577631, + "learning_rate": 3.838766760339626e-06, + "loss": 0.8185637, + "num_input_tokens_seen": 27252860, + "step": 1288, + "time_per_iteration": 3.658900022506714 + }, + { + "auxiliary_loss_clip": 0.01149496, + "auxiliary_loss_mlp": 0.01089327, + "balance_loss_clip": 1.04187012, + "balance_loss_mlp": 1.00650048, + "epoch": 0.15499308603378825, + "flos": 20082037363200.0, + "grad_norm": 2.858818593043919, + "language_loss": 0.79700184, + "learning_rate": 3.838460202500587e-06, + "loss": 0.81939006, + "num_input_tokens_seen": 27268650, + "step": 1289, + "time_per_iteration": 3.6937448978424072 + }, + { + "auxiliary_loss_clip": 0.01141272, + "auxiliary_loss_mlp": 0.01090502, + "balance_loss_clip": 1.03694677, + "balance_loss_mlp": 1.00734162, + "epoch": 0.15511332892442733, + "flos": 15918051271680.0, + "grad_norm": 1.9990653484507805, + "language_loss": 0.74232727, + "learning_rate": 3.838153365768599e-06, + "loss": 0.76464504, + "num_input_tokens_seen": 27285160, + "step": 1290, + "time_per_iteration": 2.796618938446045 + }, + { + "auxiliary_loss_clip": 0.01146812, + "auxiliary_loss_mlp": 0.01096148, + "balance_loss_clip": 1.04339528, + "balance_loss_mlp": 1.01289272, + "epoch": 0.15523357181506645, + "flos": 41282475569280.0, + "grad_norm": 3.0339032767922074, + "language_loss": 0.75297332, + "learning_rate": 3.837846250190206e-06, + "loss": 0.77540296, + "num_input_tokens_seen": 27308025, + "step": 1291, + "time_per_iteration": 3.9184837341308594 + }, + { + "auxiliary_loss_clip": 0.01153091, + "auxiliary_loss_mlp": 0.00874739, + "balance_loss_clip": 1.04710913, + "balance_loss_mlp": 1.00016975, + "epoch": 0.15535381470570553, + "flos": 18478769806080.0, + "grad_norm": 2.8159537645439485, + "language_loss": 0.76863009, + "learning_rate": 3.837538855811998e-06, + "loss": 0.78890836, + "num_input_tokens_seen": 27326200, + "step": 1292, + "time_per_iteration": 2.8875842094421387 + }, + { + "auxiliary_loss_clip": 0.01169085, + "auxiliary_loss_mlp": 0.01092593, + "balance_loss_clip": 1.0476613, + "balance_loss_mlp": 1.00952768, + "epoch": 0.1554740575963446, + "flos": 13918150759680.0, + "grad_norm": 2.0900400839673003, + "language_loss": 0.71379423, + "learning_rate": 3.837231182680606e-06, + "loss": 0.73641109, + "num_input_tokens_seen": 27344165, + "step": 1293, + "time_per_iteration": 2.7943949699401855 + }, + { + "auxiliary_loss_clip": 0.01175726, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_clip": 1.04472709, + "balance_loss_mlp": 1.00940275, + "epoch": 0.1555943004869837, + "flos": 20847078161280.0, + "grad_norm": 1.664230933343322, + "language_loss": 0.76224929, + "learning_rate": 3.836923230842706e-06, + "loss": 0.78493035, + "num_input_tokens_seen": 27363280, + "step": 1294, + "time_per_iteration": 2.7967710494995117 + }, + { + "auxiliary_loss_clip": 0.0115076, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_clip": 1.04584336, + "balance_loss_mlp": 1.00878811, + "epoch": 0.1557145433776228, + "flos": 22085888371200.0, + "grad_norm": 4.331380474865098, + "language_loss": 0.80721784, + "learning_rate": 3.836615000345011e-06, + "loss": 0.82964498, + "num_input_tokens_seen": 27381460, + "step": 1295, + "time_per_iteration": 2.8637404441833496 + }, + { + "auxiliary_loss_clip": 0.01187632, + "auxiliary_loss_mlp": 0.01091362, + "balance_loss_clip": 1.04879332, + "balance_loss_mlp": 1.00848806, + "epoch": 0.1558347862682619, + "flos": 19791987039360.0, + "grad_norm": 2.0741675288447534, + "language_loss": 0.77970803, + "learning_rate": 3.836306491234282e-06, + "loss": 0.80249804, + "num_input_tokens_seen": 27399310, + "step": 1296, + "time_per_iteration": 2.714890718460083 + }, + { + "auxiliary_loss_clip": 0.01158314, + "auxiliary_loss_mlp": 0.01092422, + "balance_loss_clip": 1.04169738, + "balance_loss_mlp": 1.00959492, + "epoch": 0.15595502915890097, + "flos": 17237086508160.0, + "grad_norm": 2.227683871770242, + "language_loss": 0.75300425, + "learning_rate": 3.835997703557317e-06, + "loss": 0.77551162, + "num_input_tokens_seen": 27416050, + "step": 1297, + "time_per_iteration": 2.7255654335021973 + }, + { + "auxiliary_loss_clip": 0.01149057, + "auxiliary_loss_mlp": 0.01089293, + "balance_loss_clip": 1.0451324, + "balance_loss_mlp": 1.00656164, + "epoch": 0.15607527204954008, + "flos": 19719519350400.0, + "grad_norm": 1.6704690769297235, + "language_loss": 0.79996645, + "learning_rate": 3.83568863736096e-06, + "loss": 0.82234997, + "num_input_tokens_seen": 27434920, + "step": 1298, + "time_per_iteration": 2.9156436920166016 + }, + { + "auxiliary_loss_clip": 0.01160051, + "auxiliary_loss_mlp": 0.01092558, + "balance_loss_clip": 1.04526901, + "balance_loss_mlp": 1.0095408, + "epoch": 0.15619551494017916, + "flos": 18515650095360.0, + "grad_norm": 2.1084686289126133, + "language_loss": 0.89222533, + "learning_rate": 3.8353792926920975e-06, + "loss": 0.91475141, + "num_input_tokens_seen": 27453570, + "step": 1299, + "time_per_iteration": 2.8291893005371094 + }, + { + "auxiliary_loss_clip": 0.01179438, + "auxiliary_loss_mlp": 0.01092843, + "balance_loss_clip": 1.04835081, + "balance_loss_mlp": 1.009444, + "epoch": 0.15631575783081825, + "flos": 19902125116800.0, + "grad_norm": 2.1371914684051547, + "language_loss": 0.81442511, + "learning_rate": 3.835069669597655e-06, + "loss": 0.83714795, + "num_input_tokens_seen": 27471960, + "step": 1300, + "time_per_iteration": 2.802433729171753 + }, + { + "auxiliary_loss_clip": 0.01181155, + "auxiliary_loss_mlp": 0.00874895, + "balance_loss_clip": 1.05003142, + "balance_loss_mlp": 1.00024033, + "epoch": 0.15643600072145733, + "flos": 20777663128320.0, + "grad_norm": 1.993938779475655, + "language_loss": 0.80015171, + "learning_rate": 3.834759768124603e-06, + "loss": 0.82071221, + "num_input_tokens_seen": 27490835, + "step": 1301, + "time_per_iteration": 2.70615291595459 + }, + { + "auxiliary_loss_clip": 0.01157308, + "auxiliary_loss_mlp": 0.01091788, + "balance_loss_clip": 1.04637599, + "balance_loss_mlp": 1.0086751, + "epoch": 0.15655624361209644, + "flos": 18546389159040.0, + "grad_norm": 3.3090696479062065, + "language_loss": 0.76473927, + "learning_rate": 3.834449588319953e-06, + "loss": 0.78723013, + "num_input_tokens_seen": 27508870, + "step": 1302, + "time_per_iteration": 2.946725606918335 + }, + { + "auxiliary_loss_clip": 0.01176749, + "auxiliary_loss_mlp": 0.01092643, + "balance_loss_clip": 1.04861999, + "balance_loss_mlp": 1.00991142, + "epoch": 0.15667648650273552, + "flos": 25229544727680.0, + "grad_norm": 1.9592204452826967, + "language_loss": 0.85381228, + "learning_rate": 3.834139130230758e-06, + "loss": 0.87650621, + "num_input_tokens_seen": 27528175, + "step": 1303, + "time_per_iteration": 2.811614751815796 + }, + { + "auxiliary_loss_clip": 0.0116653, + "auxiliary_loss_mlp": 0.01091769, + "balance_loss_clip": 1.04530859, + "balance_loss_mlp": 1.00870419, + "epoch": 0.1567967293933746, + "flos": 24827093769600.0, + "grad_norm": 2.4396439625342237, + "language_loss": 0.81139189, + "learning_rate": 3.833828393904117e-06, + "loss": 0.8339749, + "num_input_tokens_seen": 27548455, + "step": 1304, + "time_per_iteration": 2.8321847915649414 + }, + { + "auxiliary_loss_clip": 0.01148886, + "auxiliary_loss_mlp": 0.01092991, + "balance_loss_clip": 1.04409301, + "balance_loss_mlp": 1.00992632, + "epoch": 0.15691697228401372, + "flos": 19164555244800.0, + "grad_norm": 2.3146159280303515, + "language_loss": 0.77579153, + "learning_rate": 3.833517379387165e-06, + "loss": 0.79821026, + "num_input_tokens_seen": 27564910, + "step": 1305, + "time_per_iteration": 3.027820348739624 + }, + { + "auxiliary_loss_clip": 0.01180763, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_clip": 1.05034328, + "balance_loss_mlp": 1.00992775, + "epoch": 0.1570372151746528, + "flos": 24790931752320.0, + "grad_norm": 1.9308173526089736, + "language_loss": 0.8887437, + "learning_rate": 3.833206086727085e-06, + "loss": 0.91148078, + "num_input_tokens_seen": 27584260, + "step": 1306, + "time_per_iteration": 2.8871819972991943 + }, + { + "auxiliary_loss_clip": 0.01157906, + "auxiliary_loss_mlp": 0.010922, + "balance_loss_clip": 1.04485822, + "balance_loss_mlp": 1.00908744, + "epoch": 0.15715745806529188, + "flos": 24863650836480.0, + "grad_norm": 2.5696019932199294, + "language_loss": 0.70850748, + "learning_rate": 3.8328945159710994e-06, + "loss": 0.73100853, + "num_input_tokens_seen": 27604440, + "step": 1307, + "time_per_iteration": 2.93446946144104 + }, + { + "auxiliary_loss_clip": 0.01183014, + "auxiliary_loss_mlp": 0.00874753, + "balance_loss_clip": 1.05138457, + "balance_loss_mlp": 1.00020719, + "epoch": 0.157277700955931, + "flos": 21872148491520.0, + "grad_norm": 2.1756863580066916, + "language_loss": 0.8892777, + "learning_rate": 3.832582667166473e-06, + "loss": 0.90985537, + "num_input_tokens_seen": 27624250, + "step": 1308, + "time_per_iteration": 2.7845520973205566 + }, + { + "auxiliary_loss_clip": 0.01169099, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_clip": 1.04680347, + "balance_loss_mlp": 1.01022792, + "epoch": 0.15739794384657008, + "flos": 24533344344960.0, + "grad_norm": 2.11590322747719, + "language_loss": 0.81498796, + "learning_rate": 3.8322705403605125e-06, + "loss": 0.83761287, + "num_input_tokens_seen": 27644595, + "step": 1309, + "time_per_iteration": 2.9000887870788574 + }, + { + "auxiliary_loss_clip": 0.01162053, + "auxiliary_loss_mlp": 0.01089561, + "balance_loss_clip": 1.0440793, + "balance_loss_mlp": 1.00697279, + "epoch": 0.15751818673720916, + "flos": 17745329998080.0, + "grad_norm": 2.035613533955897, + "language_loss": 0.81754494, + "learning_rate": 3.831958135600568e-06, + "loss": 0.84006101, + "num_input_tokens_seen": 27662145, + "step": 1310, + "time_per_iteration": 2.854806423187256 + }, + { + "auxiliary_loss_clip": 0.01179064, + "auxiliary_loss_mlp": 0.01090468, + "balance_loss_clip": 1.04878688, + "balance_loss_mlp": 1.00778484, + "epoch": 0.15763842962784824, + "flos": 17858520731520.0, + "grad_norm": 1.9428552791081652, + "language_loss": 0.7964263, + "learning_rate": 3.831645452934032e-06, + "loss": 0.8191216, + "num_input_tokens_seen": 27680575, + "step": 1311, + "time_per_iteration": 2.6925132274627686 + }, + { + "auxiliary_loss_clip": 0.0118958, + "auxiliary_loss_mlp": 0.01094951, + "balance_loss_clip": 1.05122256, + "balance_loss_mlp": 1.01183867, + "epoch": 0.15775867251848735, + "flos": 26980908059520.0, + "grad_norm": 1.7294521594801082, + "language_loss": 0.80352163, + "learning_rate": 3.831332492408336e-06, + "loss": 0.82636696, + "num_input_tokens_seen": 27701985, + "step": 1312, + "time_per_iteration": 2.9051358699798584 + }, + { + "auxiliary_loss_clip": 0.01163594, + "auxiliary_loss_mlp": 0.01091805, + "balance_loss_clip": 1.04663217, + "balance_loss_mlp": 1.00888276, + "epoch": 0.15787891540912644, + "flos": 19240398812160.0, + "grad_norm": 1.8097235274246268, + "language_loss": 0.68714952, + "learning_rate": 3.831019254070957e-06, + "loss": 0.70970345, + "num_input_tokens_seen": 27719770, + "step": 1313, + "time_per_iteration": 4.727046728134155 + }, + { + "auxiliary_loss_clip": 0.01146777, + "auxiliary_loss_mlp": 0.01092694, + "balance_loss_clip": 1.04350221, + "balance_loss_mlp": 1.00986719, + "epoch": 0.15799915829976552, + "flos": 27271102037760.0, + "grad_norm": 2.839745434774647, + "language_loss": 0.94813865, + "learning_rate": 3.8307057379694135e-06, + "loss": 0.97053337, + "num_input_tokens_seen": 27739105, + "step": 1314, + "time_per_iteration": 2.9065170288085938 + }, + { + "auxiliary_loss_clip": 0.01186778, + "auxiliary_loss_mlp": 0.01090645, + "balance_loss_clip": 1.04841638, + "balance_loss_mlp": 1.00753236, + "epoch": 0.15811940119040463, + "flos": 20405520270720.0, + "grad_norm": 2.3167623535319493, + "language_loss": 0.8238973, + "learning_rate": 3.830391944151264e-06, + "loss": 0.84667158, + "num_input_tokens_seen": 27754985, + "step": 1315, + "time_per_iteration": 3.6931684017181396 + }, + { + "auxiliary_loss_clip": 0.01165964, + "auxiliary_loss_mlp": 0.01090488, + "balance_loss_clip": 1.04578269, + "balance_loss_mlp": 1.00761354, + "epoch": 0.1582396440810437, + "flos": 32599347661440.0, + "grad_norm": 1.871990002156012, + "language_loss": 0.67329252, + "learning_rate": 3.830077872664114e-06, + "loss": 0.69585705, + "num_input_tokens_seen": 27776110, + "step": 1316, + "time_per_iteration": 2.900413751602173 + }, + { + "auxiliary_loss_clip": 0.01131797, + "auxiliary_loss_mlp": 0.01092499, + "balance_loss_clip": 1.03944111, + "balance_loss_mlp": 1.00972033, + "epoch": 0.1583598869716828, + "flos": 33800559310080.0, + "grad_norm": 1.7785597837252014, + "language_loss": 0.73021638, + "learning_rate": 3.829763523555604e-06, + "loss": 0.75245941, + "num_input_tokens_seen": 27796510, + "step": 1317, + "time_per_iteration": 3.8706488609313965 + }, + { + "auxiliary_loss_clip": 0.01172153, + "auxiliary_loss_mlp": 0.0109193, + "balance_loss_clip": 1.04569888, + "balance_loss_mlp": 1.00943708, + "epoch": 0.15848012986232188, + "flos": 24681332378880.0, + "grad_norm": 2.206159484437493, + "language_loss": 0.77973145, + "learning_rate": 3.829448896873423e-06, + "loss": 0.80237234, + "num_input_tokens_seen": 27815610, + "step": 1318, + "time_per_iteration": 2.8989791870117188 + }, + { + "auxiliary_loss_clip": 0.01139007, + "auxiliary_loss_mlp": 0.00874744, + "balance_loss_clip": 1.04010081, + "balance_loss_mlp": 1.00025654, + "epoch": 0.158600372752961, + "flos": 22602068766720.0, + "grad_norm": 1.7365718846537748, + "language_loss": 0.79141688, + "learning_rate": 3.829133992665299e-06, + "loss": 0.81155437, + "num_input_tokens_seen": 27834735, + "step": 1319, + "time_per_iteration": 2.9197373390197754 + }, + { + "auxiliary_loss_clip": 0.01170326, + "auxiliary_loss_mlp": 0.01093575, + "balance_loss_clip": 1.04682064, + "balance_loss_mlp": 1.0107007, + "epoch": 0.15872061564360007, + "flos": 27927944092800.0, + "grad_norm": 2.0458608549676907, + "language_loss": 0.88892686, + "learning_rate": 3.828818810979002e-06, + "loss": 0.9115659, + "num_input_tokens_seen": 27853065, + "step": 1320, + "time_per_iteration": 2.782933473587036 + }, + { + "auxiliary_loss_clip": 0.01189175, + "auxiliary_loss_mlp": 0.01091712, + "balance_loss_clip": 1.05161405, + "balance_loss_mlp": 1.00907576, + "epoch": 0.15884085853423915, + "flos": 23696805525120.0, + "grad_norm": 1.8077868766193366, + "language_loss": 0.80549371, + "learning_rate": 3.8285033518623454e-06, + "loss": 0.82830262, + "num_input_tokens_seen": 27873315, + "step": 1321, + "time_per_iteration": 2.7496938705444336 + }, + { + "auxiliary_loss_clip": 0.01177722, + "auxiliary_loss_mlp": 0.01092503, + "balance_loss_clip": 1.04811013, + "balance_loss_mlp": 1.00934243, + "epoch": 0.15896110142487826, + "flos": 23112359331840.0, + "grad_norm": 7.000424275608295, + "language_loss": 0.81236982, + "learning_rate": 3.8281876153631845e-06, + "loss": 0.83507204, + "num_input_tokens_seen": 27890070, + "step": 1322, + "time_per_iteration": 2.7469005584716797 + }, + { + "auxiliary_loss_clip": 0.01149783, + "auxiliary_loss_mlp": 0.01093156, + "balance_loss_clip": 1.04668784, + "balance_loss_mlp": 1.01018596, + "epoch": 0.15908134431551735, + "flos": 14685238632960.0, + "grad_norm": 1.6934407300508003, + "language_loss": 0.64462602, + "learning_rate": 3.827871601529416e-06, + "loss": 0.66705543, + "num_input_tokens_seen": 27908590, + "step": 1323, + "time_per_iteration": 2.8551394939422607 + }, + { + "auxiliary_loss_clip": 0.01158781, + "auxiliary_loss_mlp": 0.01091595, + "balance_loss_clip": 1.04706264, + "balance_loss_mlp": 1.00881624, + "epoch": 0.15920158720615643, + "flos": 20193611984640.0, + "grad_norm": 1.674931418584533, + "language_loss": 0.8089, + "learning_rate": 3.827555310408979e-06, + "loss": 0.83140379, + "num_input_tokens_seen": 27927985, + "step": 1324, + "time_per_iteration": 2.750991106033325 + }, + { + "auxiliary_loss_clip": 0.01145049, + "auxiliary_loss_mlp": 0.01092756, + "balance_loss_clip": 1.03820789, + "balance_loss_mlp": 1.00964296, + "epoch": 0.1593218300967955, + "flos": 24826626892800.0, + "grad_norm": 1.6200017577601378, + "language_loss": 0.82822812, + "learning_rate": 3.827238742049854e-06, + "loss": 0.85060614, + "num_input_tokens_seen": 27948280, + "step": 1325, + "time_per_iteration": 2.8259928226470947 + }, + { + "auxiliary_loss_clip": 0.01186741, + "auxiliary_loss_mlp": 0.01090856, + "balance_loss_clip": 1.04865718, + "balance_loss_mlp": 1.00779104, + "epoch": 0.15944207298743462, + "flos": 28328707111680.0, + "grad_norm": 1.9441120214415897, + "language_loss": 0.51443666, + "learning_rate": 3.826921896500066e-06, + "loss": 0.53721261, + "num_input_tokens_seen": 27969565, + "step": 1326, + "time_per_iteration": 2.7432973384857178 + }, + { + "auxiliary_loss_clip": 0.01156246, + "auxiliary_loss_mlp": 0.01092396, + "balance_loss_clip": 1.04550052, + "balance_loss_mlp": 1.00923514, + "epoch": 0.1595623158780737, + "flos": 22964838174720.0, + "grad_norm": 2.6555754343004385, + "language_loss": 0.78256202, + "learning_rate": 3.826604773807678e-06, + "loss": 0.80504847, + "num_input_tokens_seen": 27987540, + "step": 1327, + "time_per_iteration": 2.850201368331909 + }, + { + "auxiliary_loss_clip": 0.01167426, + "auxiliary_loss_mlp": 0.01092272, + "balance_loss_clip": 1.04858518, + "balance_loss_mlp": 1.00915921, + "epoch": 0.1596825587687128, + "flos": 19710540950400.0, + "grad_norm": 2.9120537994714306, + "language_loss": 0.73588216, + "learning_rate": 3.826287374020798e-06, + "loss": 0.75847912, + "num_input_tokens_seen": 28002345, + "step": 1328, + "time_per_iteration": 2.7317497730255127 + }, + { + "auxiliary_loss_clip": 0.01187661, + "auxiliary_loss_mlp": 0.01090408, + "balance_loss_clip": 1.05022573, + "balance_loss_mlp": 1.00743794, + "epoch": 0.1598028016593519, + "flos": 22637727993600.0, + "grad_norm": 2.1952146888245925, + "language_loss": 0.82249731, + "learning_rate": 3.825969697187575e-06, + "loss": 0.84527802, + "num_input_tokens_seen": 28021675, + "step": 1329, + "time_per_iteration": 2.689126491546631 + }, + { + "auxiliary_loss_clip": 0.01158848, + "auxiliary_loss_mlp": 0.01094378, + "balance_loss_clip": 1.04572701, + "balance_loss_mlp": 1.0111227, + "epoch": 0.15992304454999098, + "flos": 20482908122880.0, + "grad_norm": 1.767434792782335, + "language_loss": 0.69631147, + "learning_rate": 3.8256517433562015e-06, + "loss": 0.7188437, + "num_input_tokens_seen": 28039615, + "step": 1330, + "time_per_iteration": 2.8586082458496094 + }, + { + "auxiliary_loss_clip": 0.01188807, + "auxiliary_loss_mlp": 0.0109307, + "balance_loss_clip": 1.05138707, + "balance_loss_mlp": 1.01019561, + "epoch": 0.16004328744063007, + "flos": 17676094533120.0, + "grad_norm": 2.577537414034096, + "language_loss": 0.92098713, + "learning_rate": 3.82533351257491e-06, + "loss": 0.94380593, + "num_input_tokens_seen": 28057565, + "step": 1331, + "time_per_iteration": 2.6524112224578857 + }, + { + "auxiliary_loss_clip": 0.0117394, + "auxiliary_loss_mlp": 0.01092289, + "balance_loss_clip": 1.04675305, + "balance_loss_mlp": 1.00936699, + "epoch": 0.16016353033126918, + "flos": 24098717779200.0, + "grad_norm": 1.7584845711807773, + "language_loss": 0.88898611, + "learning_rate": 3.825015004891975e-06, + "loss": 0.91164845, + "num_input_tokens_seen": 28076305, + "step": 1332, + "time_per_iteration": 2.721092462539673 + }, + { + "auxiliary_loss_clip": 0.01178094, + "auxiliary_loss_mlp": 0.01090586, + "balance_loss_clip": 1.04890585, + "balance_loss_mlp": 1.0078547, + "epoch": 0.16028377322190826, + "flos": 27634841112960.0, + "grad_norm": 1.9406381762496805, + "language_loss": 0.75822848, + "learning_rate": 3.824696220355716e-06, + "loss": 0.78091526, + "num_input_tokens_seen": 28097895, + "step": 1333, + "time_per_iteration": 2.8029558658599854 + }, + { + "auxiliary_loss_clip": 0.01162005, + "auxiliary_loss_mlp": 0.01090969, + "balance_loss_clip": 1.04431868, + "balance_loss_mlp": 1.00790417, + "epoch": 0.16040401611254734, + "flos": 20961202648320.0, + "grad_norm": 1.6112616801533959, + "language_loss": 0.78873658, + "learning_rate": 3.824377159014491e-06, + "loss": 0.8112663, + "num_input_tokens_seen": 28118790, + "step": 1334, + "time_per_iteration": 2.7994439601898193 + }, + { + "auxiliary_loss_clip": 0.01175236, + "auxiliary_loss_mlp": 0.01090689, + "balance_loss_clip": 1.04852438, + "balance_loss_mlp": 1.00781488, + "epoch": 0.16052425900318643, + "flos": 21247051080960.0, + "grad_norm": 1.929788463954882, + "language_loss": 0.84838295, + "learning_rate": 3.824057820916702e-06, + "loss": 0.87104213, + "num_input_tokens_seen": 28135995, + "step": 1335, + "time_per_iteration": 2.738175630569458 + }, + { + "auxiliary_loss_clip": 0.01160961, + "auxiliary_loss_mlp": 0.01089455, + "balance_loss_clip": 1.04279542, + "balance_loss_mlp": 1.00639033, + "epoch": 0.16064450189382554, + "flos": 15524004096000.0, + "grad_norm": 2.572498307914703, + "language_loss": 0.71663368, + "learning_rate": 3.8237382061107904e-06, + "loss": 0.73913789, + "num_input_tokens_seen": 28152715, + "step": 1336, + "time_per_iteration": 2.7700839042663574 + }, + { + "auxiliary_loss_clip": 0.01114364, + "auxiliary_loss_mlp": 0.01092095, + "balance_loss_clip": 1.03675079, + "balance_loss_mlp": 1.00931609, + "epoch": 0.16076474478446462, + "flos": 21178497974400.0, + "grad_norm": 1.6306929119179576, + "language_loss": 0.78491402, + "learning_rate": 3.823418314645243e-06, + "loss": 0.80697858, + "num_input_tokens_seen": 28171590, + "step": 1337, + "time_per_iteration": 2.8840487003326416 + }, + { + "auxiliary_loss_clip": 0.01130859, + "auxiliary_loss_mlp": 0.01092701, + "balance_loss_clip": 1.03917897, + "balance_loss_mlp": 1.00987387, + "epoch": 0.1608849876751037, + "flos": 18366476912640.0, + "grad_norm": 1.885592910613149, + "language_loss": 0.7530663, + "learning_rate": 3.823098146568588e-06, + "loss": 0.77530193, + "num_input_tokens_seen": 28191295, + "step": 1338, + "time_per_iteration": 5.000199317932129 + }, + { + "auxiliary_loss_clip": 0.01175436, + "auxiliary_loss_mlp": 0.01090056, + "balance_loss_clip": 1.04804718, + "balance_loss_mlp": 1.00737202, + "epoch": 0.1610052305657428, + "flos": 29497024880640.0, + "grad_norm": 1.8868050354970378, + "language_loss": 0.71710694, + "learning_rate": 3.822777701929394e-06, + "loss": 0.73976183, + "num_input_tokens_seen": 28213120, + "step": 1339, + "time_per_iteration": 2.7459774017333984 + }, + { + "auxiliary_loss_clip": 0.01166577, + "auxiliary_loss_mlp": 0.01091997, + "balance_loss_clip": 1.0435307, + "balance_loss_mlp": 1.00936079, + "epoch": 0.1611254734563819, + "flos": 26797871329920.0, + "grad_norm": 1.7822379463545408, + "language_loss": 0.73654425, + "learning_rate": 3.8224569807762714e-06, + "loss": 0.75913, + "num_input_tokens_seen": 28232440, + "step": 1340, + "time_per_iteration": 3.69435453414917 + }, + { + "auxiliary_loss_clip": 0.01151011, + "auxiliary_loss_mlp": 0.01089746, + "balance_loss_clip": 1.0471282, + "balance_loss_mlp": 1.0070622, + "epoch": 0.16124571634702098, + "flos": 22419570741120.0, + "grad_norm": 1.7654032494453635, + "language_loss": 0.76316929, + "learning_rate": 3.822135983157873e-06, + "loss": 0.78557694, + "num_input_tokens_seen": 28251715, + "step": 1341, + "time_per_iteration": 2.9043498039245605 + }, + { + "auxiliary_loss_clip": 0.01184242, + "auxiliary_loss_mlp": 0.00874536, + "balance_loss_clip": 1.04764891, + "balance_loss_mlp": 1.00021482, + "epoch": 0.16136595923766006, + "flos": 10999116103680.0, + "grad_norm": 2.179639369641711, + "language_loss": 0.84033, + "learning_rate": 3.821814709122896e-06, + "loss": 0.86091775, + "num_input_tokens_seen": 28269765, + "step": 1342, + "time_per_iteration": 2.7662465572357178 + }, + { + "auxiliary_loss_clip": 0.01166438, + "auxiliary_loss_mlp": 0.01090861, + "balance_loss_clip": 1.04826629, + "balance_loss_mlp": 1.00812936, + "epoch": 0.16148620212829917, + "flos": 21214983214080.0, + "grad_norm": 2.0481628821538043, + "language_loss": 0.84818345, + "learning_rate": 3.821493158720076e-06, + "loss": 0.87075639, + "num_input_tokens_seen": 28288870, + "step": 1343, + "time_per_iteration": 3.716581106185913 + }, + { + "auxiliary_loss_clip": 0.01146452, + "auxiliary_loss_mlp": 0.01089001, + "balance_loss_clip": 1.04055011, + "balance_loss_mlp": 1.00617409, + "epoch": 0.16160644501893826, + "flos": 16758468760320.0, + "grad_norm": 3.689943261333169, + "language_loss": 0.73508233, + "learning_rate": 3.821171331998191e-06, + "loss": 0.75743687, + "num_input_tokens_seen": 28305400, + "step": 1344, + "time_per_iteration": 2.769785165786743 + }, + { + "auxiliary_loss_clip": 0.01178096, + "auxiliary_loss_mlp": 0.01080236, + "balance_loss_clip": 1.07892394, + "balance_loss_mlp": 1.0001272, + "epoch": 0.16172668790957734, + "flos": 64444967308800.0, + "grad_norm": 0.7072028631147425, + "language_loss": 0.5446552, + "learning_rate": 3.820849229006064e-06, + "loss": 0.56723857, + "num_input_tokens_seen": 28373150, + "step": 1345, + "time_per_iteration": 3.5019822120666504 + }, + { + "auxiliary_loss_clip": 0.01183251, + "auxiliary_loss_mlp": 0.0109245, + "balance_loss_clip": 1.04651558, + "balance_loss_mlp": 1.00957584, + "epoch": 0.16184693080021645, + "flos": 23257689759360.0, + "grad_norm": 2.015463958593201, + "language_loss": 0.70961446, + "learning_rate": 3.8205268497925564e-06, + "loss": 0.73237145, + "num_input_tokens_seen": 28393620, + "step": 1346, + "time_per_iteration": 2.684997797012329 + }, + { + "auxiliary_loss_clip": 0.01186256, + "auxiliary_loss_mlp": 0.01089974, + "balance_loss_clip": 1.04981136, + "balance_loss_mlp": 1.00719464, + "epoch": 0.16196717369085553, + "flos": 17451113696640.0, + "grad_norm": 2.052378142415891, + "language_loss": 0.78243208, + "learning_rate": 3.8202041944065725e-06, + "loss": 0.80519438, + "num_input_tokens_seen": 28409440, + "step": 1347, + "time_per_iteration": 2.7138102054595947 + }, + { + "auxiliary_loss_clip": 0.01184127, + "auxiliary_loss_mlp": 0.01091576, + "balance_loss_clip": 1.04831481, + "balance_loss_mlp": 1.00898814, + "epoch": 0.16208741658149461, + "flos": 23873377806720.0, + "grad_norm": 1.914485470140746, + "language_loss": 0.73554808, + "learning_rate": 3.819881262897061e-06, + "loss": 0.75830513, + "num_input_tokens_seen": 28427575, + "step": 1348, + "time_per_iteration": 2.7572402954101562 + }, + { + "auxiliary_loss_clip": 0.01148659, + "auxiliary_loss_mlp": 0.01088138, + "balance_loss_clip": 1.04366052, + "balance_loss_mlp": 1.00550246, + "epoch": 0.1622076594721337, + "flos": 25884806584320.0, + "grad_norm": 1.8384932841258117, + "language_loss": 0.73623776, + "learning_rate": 3.819558055313008e-06, + "loss": 0.75860578, + "num_input_tokens_seen": 28448260, + "step": 1349, + "time_per_iteration": 2.8912501335144043 + }, + { + "auxiliary_loss_clip": 0.01174935, + "auxiliary_loss_mlp": 0.01095639, + "balance_loss_clip": 1.04693794, + "balance_loss_mlp": 1.01266932, + "epoch": 0.1623279023627728, + "flos": 21539759011200.0, + "grad_norm": 1.807796402510907, + "language_loss": 0.77454174, + "learning_rate": 3.819234571703444e-06, + "loss": 0.79724747, + "num_input_tokens_seen": 28467085, + "step": 1350, + "time_per_iteration": 2.7175629138946533 + }, + { + "auxiliary_loss_clip": 0.01176109, + "auxiliary_loss_mlp": 0.01091613, + "balance_loss_clip": 1.04739451, + "balance_loss_mlp": 1.00864363, + "epoch": 0.1624481452534119, + "flos": 22085421494400.0, + "grad_norm": 1.9197703194330213, + "language_loss": 0.85578352, + "learning_rate": 3.8189108121174435e-06, + "loss": 0.87846076, + "num_input_tokens_seen": 28486850, + "step": 1351, + "time_per_iteration": 2.707719326019287 + }, + { + "auxiliary_loss_clip": 0.01138322, + "auxiliary_loss_mlp": 0.01091926, + "balance_loss_clip": 1.03641319, + "balance_loss_mlp": 1.00924265, + "epoch": 0.16256838814405097, + "flos": 27087490690560.0, + "grad_norm": 1.856268137714677, + "language_loss": 0.83576578, + "learning_rate": 3.818586776604118e-06, + "loss": 0.85806829, + "num_input_tokens_seen": 28507490, + "step": 1352, + "time_per_iteration": 2.8527650833129883 + }, + { + "auxiliary_loss_clip": 0.01170773, + "auxiliary_loss_mlp": 0.01092556, + "balance_loss_clip": 1.05040026, + "balance_loss_mlp": 1.00982499, + "epoch": 0.16268863103469008, + "flos": 20120354196480.0, + "grad_norm": 1.9566024755766296, + "language_loss": 0.61428344, + "learning_rate": 3.818262465212625e-06, + "loss": 0.63691682, + "num_input_tokens_seen": 28527615, + "step": 1353, + "time_per_iteration": 2.716665744781494 + }, + { + "auxiliary_loss_clip": 0.01171511, + "auxiliary_loss_mlp": 0.01092272, + "balance_loss_clip": 1.0492059, + "balance_loss_mlp": 1.00925517, + "epoch": 0.16280887392532917, + "flos": 18332792933760.0, + "grad_norm": 2.0587379160705725, + "language_loss": 0.77354884, + "learning_rate": 3.817937877992161e-06, + "loss": 0.79618663, + "num_input_tokens_seen": 28544910, + "step": 1354, + "time_per_iteration": 2.79427433013916 + }, + { + "auxiliary_loss_clip": 0.01160711, + "auxiliary_loss_mlp": 0.00874692, + "balance_loss_clip": 1.04862678, + "balance_loss_mlp": 1.00019073, + "epoch": 0.16292911681596825, + "flos": 11874330892800.0, + "grad_norm": 2.2624690584347027, + "language_loss": 0.8569786, + "learning_rate": 3.817613014991967e-06, + "loss": 0.87733269, + "num_input_tokens_seen": 28561050, + "step": 1355, + "time_per_iteration": 2.7847609519958496 + }, + { + "auxiliary_loss_clip": 0.01148513, + "auxiliary_loss_mlp": 0.01093093, + "balance_loss_clip": 1.0442698, + "balance_loss_mlp": 1.01012278, + "epoch": 0.16304935970660733, + "flos": 26103466627200.0, + "grad_norm": 1.861527901151608, + "language_loss": 0.76007819, + "learning_rate": 3.817287876261323e-06, + "loss": 0.78249419, + "num_input_tokens_seen": 28581385, + "step": 1356, + "time_per_iteration": 2.893831968307495 + }, + { + "auxiliary_loss_clip": 0.01153013, + "auxiliary_loss_mlp": 0.01092515, + "balance_loss_clip": 1.04310703, + "balance_loss_mlp": 1.00954556, + "epoch": 0.16316960259724644, + "flos": 29351945848320.0, + "grad_norm": 2.7296076523587773, + "language_loss": 0.80097055, + "learning_rate": 3.816962461849553e-06, + "loss": 0.82342577, + "num_input_tokens_seen": 28603255, + "step": 1357, + "time_per_iteration": 2.8816335201263428 + }, + { + "auxiliary_loss_clip": 0.01159763, + "auxiliary_loss_mlp": 0.01090452, + "balance_loss_clip": 1.04333901, + "balance_loss_mlp": 1.00767303, + "epoch": 0.16328984548788553, + "flos": 20886759711360.0, + "grad_norm": 1.798765043678204, + "language_loss": 0.84273565, + "learning_rate": 3.8166367718060235e-06, + "loss": 0.86523777, + "num_input_tokens_seen": 28623145, + "step": 1358, + "time_per_iteration": 2.800245761871338 + }, + { + "auxiliary_loss_clip": 0.01174008, + "auxiliary_loss_mlp": 0.01091211, + "balance_loss_clip": 1.04603887, + "balance_loss_mlp": 1.00857508, + "epoch": 0.1634100883785246, + "flos": 18041090584320.0, + "grad_norm": 3.4708422382980713, + "language_loss": 0.7617718, + "learning_rate": 3.816310806180139e-06, + "loss": 0.78442401, + "num_input_tokens_seen": 28641555, + "step": 1359, + "time_per_iteration": 2.723191738128662 + }, + { + "auxiliary_loss_clip": 0.01152062, + "auxiliary_loss_mlp": 0.01093874, + "balance_loss_clip": 1.04185069, + "balance_loss_mlp": 1.01076174, + "epoch": 0.16353033126916372, + "flos": 24572128055040.0, + "grad_norm": 1.63402882949347, + "language_loss": 0.81120187, + "learning_rate": 3.81598456502135e-06, + "loss": 0.8336612, + "num_input_tokens_seen": 28661575, + "step": 1360, + "time_per_iteration": 2.7376391887664795 + }, + { + "auxiliary_loss_clip": 0.01153778, + "auxiliary_loss_mlp": 0.01089183, + "balance_loss_clip": 1.04409337, + "balance_loss_mlp": 1.00659454, + "epoch": 0.1636505741598028, + "flos": 19892895321600.0, + "grad_norm": 1.9370374236901038, + "language_loss": 0.87260348, + "learning_rate": 3.8156580483791455e-06, + "loss": 0.89503306, + "num_input_tokens_seen": 28676765, + "step": 1361, + "time_per_iteration": 2.7862939834594727 + }, + { + "auxiliary_loss_clip": 0.01184301, + "auxiliary_loss_mlp": 0.01092905, + "balance_loss_clip": 1.04862452, + "balance_loss_mlp": 1.0104121, + "epoch": 0.16377081705044189, + "flos": 28402611344640.0, + "grad_norm": 3.1855354808390937, + "language_loss": 0.77219152, + "learning_rate": 3.815331256303059e-06, + "loss": 0.7949636, + "num_input_tokens_seen": 28696795, + "step": 1362, + "time_per_iteration": 2.7509799003601074 + }, + { + "auxiliary_loss_clip": 0.01147211, + "auxiliary_loss_mlp": 0.01090328, + "balance_loss_clip": 1.04268122, + "balance_loss_mlp": 1.0076443, + "epoch": 0.163891059941081, + "flos": 21908059113600.0, + "grad_norm": 2.2737770321042516, + "language_loss": 0.77502429, + "learning_rate": 3.815004188842665e-06, + "loss": 0.79739976, + "num_input_tokens_seen": 28714835, + "step": 1363, + "time_per_iteration": 3.844587802886963 + }, + { + "auxiliary_loss_clip": 0.01165831, + "auxiliary_loss_mlp": 0.01089688, + "balance_loss_clip": 1.04619133, + "balance_loss_mlp": 1.00714779, + "epoch": 0.16401130283172008, + "flos": 26797619934720.0, + "grad_norm": 1.6062752483602953, + "language_loss": 0.79877776, + "learning_rate": 3.814676846047578e-06, + "loss": 0.82133299, + "num_input_tokens_seen": 28735710, + "step": 1364, + "time_per_iteration": 3.7808384895324707 + }, + { + "auxiliary_loss_clip": 0.01174686, + "auxiliary_loss_mlp": 0.01093939, + "balance_loss_clip": 1.04688823, + "balance_loss_mlp": 1.01120782, + "epoch": 0.16413154572235916, + "flos": 32997417160320.0, + "grad_norm": 1.9108603480203081, + "language_loss": 0.69469023, + "learning_rate": 3.8143492279674565e-06, + "loss": 0.71737647, + "num_input_tokens_seen": 28758405, + "step": 1365, + "time_per_iteration": 2.852938652038574 + }, + { + "auxiliary_loss_clip": 0.01178459, + "auxiliary_loss_mlp": 0.01080976, + "balance_loss_clip": 1.07999539, + "balance_loss_mlp": 1.00086772, + "epoch": 0.16425178861299825, + "flos": 40113622074240.0, + "grad_norm": 0.9127522677705584, + "language_loss": 0.58451498, + "learning_rate": 3.8140213346519997e-06, + "loss": 0.60710943, + "num_input_tokens_seen": 28809000, + "step": 1366, + "time_per_iteration": 4.105006217956543 + }, + { + "auxiliary_loss_clip": 0.01141967, + "auxiliary_loss_mlp": 0.01090173, + "balance_loss_clip": 1.04362214, + "balance_loss_mlp": 1.00744224, + "epoch": 0.16437203150363736, + "flos": 25447486498560.0, + "grad_norm": 1.7481138400070246, + "language_loss": 0.77267075, + "learning_rate": 3.813693166150948e-06, + "loss": 0.79499221, + "num_input_tokens_seen": 28829210, + "step": 1367, + "time_per_iteration": 2.8971714973449707 + }, + { + "auxiliary_loss_clip": 0.01150517, + "auxiliary_loss_mlp": 0.01091303, + "balance_loss_clip": 1.04329681, + "balance_loss_mlp": 1.00838089, + "epoch": 0.16449227439427644, + "flos": 23476888506240.0, + "grad_norm": 2.0559172917712596, + "language_loss": 0.85587859, + "learning_rate": 3.813364722514086e-06, + "loss": 0.87829685, + "num_input_tokens_seen": 28847545, + "step": 1368, + "time_per_iteration": 3.7947380542755127 + }, + { + "auxiliary_loss_clip": 0.01175842, + "auxiliary_loss_mlp": 0.01090272, + "balance_loss_clip": 1.04879642, + "balance_loss_mlp": 1.00758827, + "epoch": 0.16461251728491552, + "flos": 13545217802880.0, + "grad_norm": 2.4936829075829006, + "language_loss": 0.8033548, + "learning_rate": 3.8130360037912368e-06, + "loss": 0.82601595, + "num_input_tokens_seen": 28863990, + "step": 1369, + "time_per_iteration": 2.7343692779541016 + }, + { + "auxiliary_loss_clip": 0.01165508, + "auxiliary_loss_mlp": 0.0108995, + "balance_loss_clip": 1.04217684, + "balance_loss_mlp": 1.00717056, + "epoch": 0.16473276017555463, + "flos": 23003298662400.0, + "grad_norm": 9.237218565938512, + "language_loss": 0.81827354, + "learning_rate": 3.812707010032268e-06, + "loss": 0.84082806, + "num_input_tokens_seen": 28883045, + "step": 1370, + "time_per_iteration": 2.743393898010254 + }, + { + "auxiliary_loss_clip": 0.0117506, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_clip": 1.04870486, + "balance_loss_mlp": 1.00913405, + "epoch": 0.16485300306619372, + "flos": 24790680357120.0, + "grad_norm": 2.928512410369909, + "language_loss": 0.79447925, + "learning_rate": 3.8123777412870863e-06, + "loss": 0.81714761, + "num_input_tokens_seen": 28902545, + "step": 1371, + "time_per_iteration": 2.7726852893829346 + }, + { + "auxiliary_loss_clip": 0.0116266, + "auxiliary_loss_mlp": 0.01091703, + "balance_loss_clip": 1.04483056, + "balance_loss_mlp": 1.00911462, + "epoch": 0.1649732459568328, + "flos": 21106497162240.0, + "grad_norm": 1.882469392652005, + "language_loss": 0.78263795, + "learning_rate": 3.812048197605643e-06, + "loss": 0.80518156, + "num_input_tokens_seen": 28921440, + "step": 1372, + "time_per_iteration": 2.7810285091400146 + }, + { + "auxiliary_loss_clip": 0.01169285, + "auxiliary_loss_mlp": 0.01092158, + "balance_loss_clip": 1.04453945, + "balance_loss_mlp": 1.0092839, + "epoch": 0.16509348884747188, + "flos": 20266726118400.0, + "grad_norm": 1.883913619162804, + "language_loss": 0.81526542, + "learning_rate": 3.8117183790379277e-06, + "loss": 0.83787984, + "num_input_tokens_seen": 28939890, + "step": 1373, + "time_per_iteration": 2.7416868209838867 + }, + { + "auxiliary_loss_clip": 0.01181641, + "auxiliary_loss_mlp": 0.01090571, + "balance_loss_clip": 1.04668689, + "balance_loss_mlp": 1.00803089, + "epoch": 0.165213731738111, + "flos": 11035493602560.0, + "grad_norm": 2.828293008949056, + "language_loss": 0.94433308, + "learning_rate": 3.811388285633976e-06, + "loss": 0.9670552, + "num_input_tokens_seen": 28955875, + "step": 1374, + "time_per_iteration": 2.7538819313049316 + }, + { + "auxiliary_loss_clip": 0.01141897, + "auxiliary_loss_mlp": 0.01092178, + "balance_loss_clip": 1.04372954, + "balance_loss_mlp": 1.00925577, + "epoch": 0.16533397462875007, + "flos": 29972051268480.0, + "grad_norm": 2.2417138599339825, + "language_loss": 0.6217097, + "learning_rate": 3.811057917443861e-06, + "loss": 0.64405048, + "num_input_tokens_seen": 28975140, + "step": 1375, + "time_per_iteration": 2.940081834793091 + }, + { + "auxiliary_loss_clip": 0.0118718, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_clip": 1.08028889, + "balance_loss_mlp": 1.00051022, + "epoch": 0.16545421751938916, + "flos": 65556763027200.0, + "grad_norm": 0.8572433570593949, + "language_loss": 0.68394458, + "learning_rate": 3.8107272745177e-06, + "loss": 0.7066226, + "num_input_tokens_seen": 29047470, + "step": 1376, + "time_per_iteration": 3.472034215927124 + }, + { + "auxiliary_loss_clip": 0.01143728, + "auxiliary_loss_mlp": 0.01090163, + "balance_loss_clip": 1.03987634, + "balance_loss_mlp": 1.00738406, + "epoch": 0.16557446041002827, + "flos": 22492361652480.0, + "grad_norm": 1.9010443275745186, + "language_loss": 0.78615153, + "learning_rate": 3.8103963569056513e-06, + "loss": 0.80849046, + "num_input_tokens_seen": 29066605, + "step": 1377, + "time_per_iteration": 2.8409204483032227 + }, + { + "auxiliary_loss_clip": 0.01163539, + "auxiliary_loss_mlp": 0.01092358, + "balance_loss_clip": 1.04664075, + "balance_loss_mlp": 1.01000834, + "epoch": 0.16569470330066735, + "flos": 24602723464320.0, + "grad_norm": 1.6216339653361138, + "language_loss": 0.88174033, + "learning_rate": 3.8100651646579146e-06, + "loss": 0.90429932, + "num_input_tokens_seen": 29085815, + "step": 1378, + "time_per_iteration": 2.791003942489624 + }, + { + "auxiliary_loss_clip": 0.0116496, + "auxiliary_loss_mlp": 0.01092011, + "balance_loss_clip": 1.04609966, + "balance_loss_mlp": 1.00937462, + "epoch": 0.16581494619130643, + "flos": 15006207588480.0, + "grad_norm": 1.9266221193529687, + "language_loss": 0.92465001, + "learning_rate": 3.8097336978247317e-06, + "loss": 0.94721967, + "num_input_tokens_seen": 29102520, + "step": 1379, + "time_per_iteration": 2.892866849899292 + }, + { + "auxiliary_loss_clip": 0.01150905, + "auxiliary_loss_mlp": 0.01090026, + "balance_loss_clip": 1.04479098, + "balance_loss_mlp": 1.0071044, + "epoch": 0.16593518908194552, + "flos": 17420338719360.0, + "grad_norm": 2.021360234695856, + "language_loss": 0.88920617, + "learning_rate": 3.8094019564563854e-06, + "loss": 0.91161549, + "num_input_tokens_seen": 29119450, + "step": 1380, + "time_per_iteration": 2.770684003829956 + }, + { + "auxiliary_loss_clip": 0.01181933, + "auxiliary_loss_mlp": 0.00874688, + "balance_loss_clip": 1.0471487, + "balance_loss_mlp": 1.00024891, + "epoch": 0.16605543197258463, + "flos": 20412631163520.0, + "grad_norm": 2.0082710625310223, + "language_loss": 0.7497257, + "learning_rate": 3.809069940603201e-06, + "loss": 0.77029192, + "num_input_tokens_seen": 29137405, + "step": 1381, + "time_per_iteration": 2.7413432598114014 + }, + { + "auxiliary_loss_clip": 0.01162, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_clip": 1.04549408, + "balance_loss_mlp": 1.00814533, + "epoch": 0.1661756748632237, + "flos": 14209745368320.0, + "grad_norm": 2.4047690528060106, + "language_loss": 0.7736885, + "learning_rate": 3.8087376503155452e-06, + "loss": 0.79621583, + "num_input_tokens_seen": 29154890, + "step": 1382, + "time_per_iteration": 2.805773973464966 + }, + { + "auxiliary_loss_clip": 0.01186778, + "auxiliary_loss_mlp": 0.01080404, + "balance_loss_clip": 1.08018577, + "balance_loss_mlp": 1.00029564, + "epoch": 0.1662959177538628, + "flos": 66080877350400.0, + "grad_norm": 0.8980019760312924, + "language_loss": 0.5624547, + "learning_rate": 3.808405085643826e-06, + "loss": 0.58512652, + "num_input_tokens_seen": 29219770, + "step": 1383, + "time_per_iteration": 3.3643105030059814 + }, + { + "auxiliary_loss_clip": 0.0118249, + "auxiliary_loss_mlp": 0.00874691, + "balance_loss_clip": 1.04791272, + "balance_loss_mlp": 1.00022769, + "epoch": 0.1664161606445019, + "flos": 20740567357440.0, + "grad_norm": 1.879541969726524, + "language_loss": 0.88746536, + "learning_rate": 3.8080722466384925e-06, + "loss": 0.90803719, + "num_input_tokens_seen": 29237620, + "step": 1384, + "time_per_iteration": 2.7714977264404297 + }, + { + "auxiliary_loss_clip": 0.01179321, + "auxiliary_loss_mlp": 0.01090691, + "balance_loss_clip": 1.0451895, + "balance_loss_mlp": 1.00786412, + "epoch": 0.166536403535141, + "flos": 25260930236160.0, + "grad_norm": 2.0702962666598346, + "language_loss": 0.71071613, + "learning_rate": 3.8077391333500376e-06, + "loss": 0.7334162, + "num_input_tokens_seen": 29256760, + "step": 1385, + "time_per_iteration": 2.722756862640381 + }, + { + "auxiliary_loss_clip": 0.01162934, + "auxiliary_loss_mlp": 0.01089055, + "balance_loss_clip": 1.0469923, + "balance_loss_mlp": 1.00646687, + "epoch": 0.16665664642578007, + "flos": 25447450584960.0, + "grad_norm": 1.6954136442763823, + "language_loss": 0.76739711, + "learning_rate": 3.8074057458289934e-06, + "loss": 0.78991699, + "num_input_tokens_seen": 29277450, + "step": 1386, + "time_per_iteration": 2.82077693939209 + }, + { + "auxiliary_loss_clip": 0.01160192, + "auxiliary_loss_mlp": 0.010924, + "balance_loss_clip": 1.04430735, + "balance_loss_mlp": 1.00957358, + "epoch": 0.16677688931641918, + "flos": 22200767043840.0, + "grad_norm": 2.231338968058559, + "language_loss": 0.82484293, + "learning_rate": 3.807072084125934e-06, + "loss": 0.8473689, + "num_input_tokens_seen": 29299300, + "step": 1387, + "time_per_iteration": 2.7903528213500977 + }, + { + "auxiliary_loss_clip": 0.01149878, + "auxiliary_loss_mlp": 0.01090319, + "balance_loss_clip": 1.03882337, + "balance_loss_mlp": 1.00749183, + "epoch": 0.16689713220705826, + "flos": 16945958776320.0, + "grad_norm": 2.182880285491305, + "language_loss": 0.80901581, + "learning_rate": 3.806738148291477e-06, + "loss": 0.8314178, + "num_input_tokens_seen": 29316125, + "step": 1388, + "time_per_iteration": 3.685781240463257 + }, + { + "auxiliary_loss_clip": 0.01133854, + "auxiliary_loss_mlp": 0.01091555, + "balance_loss_clip": 1.0433991, + "balance_loss_mlp": 1.00863302, + "epoch": 0.16701737509769735, + "flos": 36244423923840.0, + "grad_norm": 1.8440760592117926, + "language_loss": 0.71444571, + "learning_rate": 3.8064039383762793e-06, + "loss": 0.73669982, + "num_input_tokens_seen": 29338490, + "step": 1389, + "time_per_iteration": 3.0153701305389404 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01091437, + "balance_loss_clip": 1.04259419, + "balance_loss_mlp": 1.00894368, + "epoch": 0.16713761798833643, + "flos": 23258659426560.0, + "grad_norm": 2.3728490306860244, + "language_loss": 0.77244622, + "learning_rate": 3.8060694544310396e-06, + "loss": 0.79500902, + "num_input_tokens_seen": 29357000, + "step": 1390, + "time_per_iteration": 3.7614521980285645 + }, + { + "auxiliary_loss_clip": 0.01182662, + "auxiliary_loss_mlp": 0.01091176, + "balance_loss_clip": 1.04863167, + "balance_loss_mlp": 1.00811088, + "epoch": 0.16725786087897554, + "flos": 25302515207040.0, + "grad_norm": 1.763015325923877, + "language_loss": 0.7867043, + "learning_rate": 3.8057346965065006e-06, + "loss": 0.8094427, + "num_input_tokens_seen": 29378230, + "step": 1391, + "time_per_iteration": 3.688121795654297 + }, + { + "auxiliary_loss_clip": 0.01153746, + "auxiliary_loss_mlp": 0.01091193, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.00865245, + "epoch": 0.16737810376961462, + "flos": 31831541516160.0, + "grad_norm": 2.1054064122245117, + "language_loss": 0.84561622, + "learning_rate": 3.805399664653443e-06, + "loss": 0.8680656, + "num_input_tokens_seen": 29400370, + "step": 1392, + "time_per_iteration": 2.916982889175415 + }, + { + "auxiliary_loss_clip": 0.0118129, + "auxiliary_loss_mlp": 0.01090048, + "balance_loss_clip": 1.04700601, + "balance_loss_mlp": 1.00726914, + "epoch": 0.1674983466602537, + "flos": 27961843553280.0, + "grad_norm": 2.3734343857979545, + "language_loss": 0.74720377, + "learning_rate": 3.805064358922692e-06, + "loss": 0.76991713, + "num_input_tokens_seen": 29418660, + "step": 1393, + "time_per_iteration": 3.7240545749664307 + }, + { + "auxiliary_loss_clip": 0.01172816, + "auxiliary_loss_mlp": 0.01092259, + "balance_loss_clip": 1.04732096, + "balance_loss_mlp": 1.00948036, + "epoch": 0.16761858955089282, + "flos": 21762656858880.0, + "grad_norm": 1.9040646254294162, + "language_loss": 0.810094, + "learning_rate": 3.8047287793651136e-06, + "loss": 0.83274472, + "num_input_tokens_seen": 29440105, + "step": 1394, + "time_per_iteration": 2.766846179962158 + }, + { + "auxiliary_loss_clip": 0.01150592, + "auxiliary_loss_mlp": 0.01094282, + "balance_loss_clip": 1.04339337, + "balance_loss_mlp": 1.01169372, + "epoch": 0.1677388324415319, + "flos": 23805507058560.0, + "grad_norm": 1.8989560744741765, + "language_loss": 0.88471663, + "learning_rate": 3.8043929260316137e-06, + "loss": 0.90716535, + "num_input_tokens_seen": 29458260, + "step": 1395, + "time_per_iteration": 2.841315269470215 + }, + { + "auxiliary_loss_clip": 0.01162494, + "auxiliary_loss_mlp": 0.01091367, + "balance_loss_clip": 1.04807925, + "balance_loss_mlp": 1.0087316, + "epoch": 0.16785907533217098, + "flos": 20558859431040.0, + "grad_norm": 2.0173754857110784, + "language_loss": 0.83795607, + "learning_rate": 3.8040567989731417e-06, + "loss": 0.86049473, + "num_input_tokens_seen": 29476205, + "step": 1396, + "time_per_iteration": 2.756588935852051 + }, + { + "auxiliary_loss_clip": 0.01169837, + "auxiliary_loss_mlp": 0.01090508, + "balance_loss_clip": 1.04600966, + "balance_loss_mlp": 1.00801551, + "epoch": 0.16797931822281006, + "flos": 15669657745920.0, + "grad_norm": 2.0596024870233083, + "language_loss": 0.8008703, + "learning_rate": 3.8037203982406876e-06, + "loss": 0.82347375, + "num_input_tokens_seen": 29494370, + "step": 1397, + "time_per_iteration": 2.7338738441467285 + }, + { + "auxiliary_loss_clip": 0.0118171, + "auxiliary_loss_mlp": 0.01091319, + "balance_loss_clip": 1.04833221, + "balance_loss_mlp": 1.00868273, + "epoch": 0.16809956111344918, + "flos": 16541101607040.0, + "grad_norm": 1.8115461117838056, + "language_loss": 0.73278606, + "learning_rate": 3.8033837238852835e-06, + "loss": 0.75551629, + "num_input_tokens_seen": 29511070, + "step": 1398, + "time_per_iteration": 2.6515145301818848 + }, + { + "auxiliary_loss_clip": 0.0115956, + "auxiliary_loss_mlp": 0.0109228, + "balance_loss_clip": 1.04373264, + "balance_loss_mlp": 1.00969207, + "epoch": 0.16821980400408826, + "flos": 23258084808960.0, + "grad_norm": 1.93057997708637, + "language_loss": 0.69387978, + "learning_rate": 3.8030467759580017e-06, + "loss": 0.71639818, + "num_input_tokens_seen": 29531990, + "step": 1399, + "time_per_iteration": 2.7899458408355713 + }, + { + "auxiliary_loss_clip": 0.01172314, + "auxiliary_loss_mlp": 0.01092145, + "balance_loss_clip": 1.04690671, + "balance_loss_mlp": 1.0096041, + "epoch": 0.16834004689472734, + "flos": 20774754126720.0, + "grad_norm": 2.3446477901168135, + "language_loss": 0.86885893, + "learning_rate": 3.802709554509958e-06, + "loss": 0.89150351, + "num_input_tokens_seen": 29549790, + "step": 1400, + "time_per_iteration": 2.640791654586792 + }, + { + "auxiliary_loss_clip": 0.01160915, + "auxiliary_loss_mlp": 0.01089735, + "balance_loss_clip": 1.04497862, + "balance_loss_mlp": 1.0072422, + "epoch": 0.16846028978536645, + "flos": 26687302289280.0, + "grad_norm": 1.6832706098674262, + "language_loss": 0.79839122, + "learning_rate": 3.8023720595923083e-06, + "loss": 0.82089776, + "num_input_tokens_seen": 29569045, + "step": 1401, + "time_per_iteration": 2.8278329372406006 + }, + { + "auxiliary_loss_clip": 0.01139421, + "auxiliary_loss_mlp": 0.01090087, + "balance_loss_clip": 1.04268169, + "balance_loss_mlp": 1.00788021, + "epoch": 0.16858053267600553, + "flos": 18843298980480.0, + "grad_norm": 2.336774192344846, + "language_loss": 0.87804443, + "learning_rate": 3.80203429125625e-06, + "loss": 0.90033948, + "num_input_tokens_seen": 29587220, + "step": 1402, + "time_per_iteration": 3.0971930027008057 + }, + { + "auxiliary_loss_clip": 0.01122108, + "auxiliary_loss_mlp": 0.01092225, + "balance_loss_clip": 1.03655577, + "balance_loss_mlp": 1.00977969, + "epoch": 0.16870077556664462, + "flos": 27744548227200.0, + "grad_norm": 1.8363250386877668, + "language_loss": 0.70147979, + "learning_rate": 3.8016962495530225e-06, + "loss": 0.72362316, + "num_input_tokens_seen": 29606410, + "step": 1403, + "time_per_iteration": 2.8994951248168945 + }, + { + "auxiliary_loss_clip": 0.01178359, + "auxiliary_loss_mlp": 0.01093464, + "balance_loss_clip": 1.04515147, + "balance_loss_mlp": 1.01063693, + "epoch": 0.1688210184572837, + "flos": 13730768484480.0, + "grad_norm": 2.4065025260220843, + "language_loss": 0.76378751, + "learning_rate": 3.8013579345339063e-06, + "loss": 0.78650576, + "num_input_tokens_seen": 29621275, + "step": 1404, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.01148034, + "auxiliary_loss_mlp": 0.01091345, + "balance_loss_clip": 1.04227364, + "balance_loss_mlp": 1.00875723, + "epoch": 0.1689412613479228, + "flos": 26468785900800.0, + "grad_norm": 2.214031097601585, + "language_loss": 0.69554943, + "learning_rate": 3.801019346250224e-06, + "loss": 0.71794319, + "num_input_tokens_seen": 29641420, + "step": 1405, + "time_per_iteration": 2.999727964401245 + }, + { + "auxiliary_loss_clip": 0.01163551, + "auxiliary_loss_mlp": 0.01088759, + "balance_loss_clip": 1.04165101, + "balance_loss_mlp": 1.0063138, + "epoch": 0.1690615042385619, + "flos": 21138852337920.0, + "grad_norm": 2.1916166265998025, + "language_loss": 0.8362906, + "learning_rate": 3.8006804847533395e-06, + "loss": 0.8588137, + "num_input_tokens_seen": 29660935, + "step": 1406, + "time_per_iteration": 2.7992637157440186 + }, + { + "auxiliary_loss_clip": 0.01179688, + "auxiliary_loss_mlp": 0.01092221, + "balance_loss_clip": 1.04679406, + "balance_loss_mlp": 1.00968051, + "epoch": 0.16918174712920098, + "flos": 20849340718080.0, + "grad_norm": 1.8580537871634002, + "language_loss": 0.85189295, + "learning_rate": 3.8003413500946556e-06, + "loss": 0.87461197, + "num_input_tokens_seen": 29681045, + "step": 1407, + "time_per_iteration": 2.7142767906188965 + }, + { + "auxiliary_loss_clip": 0.01158228, + "auxiliary_loss_mlp": 0.01092217, + "balance_loss_clip": 1.04359901, + "balance_loss_mlp": 1.00948572, + "epoch": 0.1693019900198401, + "flos": 16983270028800.0, + "grad_norm": 2.4851939784264214, + "language_loss": 0.828246, + "learning_rate": 3.8000019423256216e-06, + "loss": 0.85075045, + "num_input_tokens_seen": 29698810, + "step": 1408, + "time_per_iteration": 2.740222930908203 + }, + { + "auxiliary_loss_clip": 0.01153919, + "auxiliary_loss_mlp": 0.01090148, + "balance_loss_clip": 1.04261136, + "balance_loss_mlp": 1.00779843, + "epoch": 0.16942223291047917, + "flos": 26796901662720.0, + "grad_norm": 1.7587251959061527, + "language_loss": 0.88389355, + "learning_rate": 3.7996622614977234e-06, + "loss": 0.90633422, + "num_input_tokens_seen": 29720000, + "step": 1409, + "time_per_iteration": 2.8049166202545166 + }, + { + "auxiliary_loss_clip": 0.01156269, + "auxiliary_loss_mlp": 0.0109439, + "balance_loss_clip": 1.04329395, + "balance_loss_mlp": 1.0120883, + "epoch": 0.16954247580111825, + "flos": 18583700411520.0, + "grad_norm": 1.8115209648317578, + "language_loss": 0.79121298, + "learning_rate": 3.799322307662492e-06, + "loss": 0.81371957, + "num_input_tokens_seen": 29737820, + "step": 1410, + "time_per_iteration": 2.7075417041778564 + }, + { + "auxiliary_loss_clip": 0.01140766, + "auxiliary_loss_mlp": 0.01091028, + "balance_loss_clip": 1.04401255, + "balance_loss_mlp": 1.00844014, + "epoch": 0.16966271869175734, + "flos": 13983651210240.0, + "grad_norm": 2.189739827598456, + "language_loss": 0.8390578, + "learning_rate": 3.798982080871496e-06, + "loss": 0.86137575, + "num_input_tokens_seen": 29752960, + "step": 1411, + "time_per_iteration": 2.870250940322876 + }, + { + "auxiliary_loss_clip": 0.01179047, + "auxiliary_loss_mlp": 0.01090058, + "balance_loss_clip": 1.04578304, + "balance_loss_mlp": 1.00737393, + "epoch": 0.16978296158239645, + "flos": 37487328284160.0, + "grad_norm": 2.0579010924898338, + "language_loss": 0.67995358, + "learning_rate": 3.798641581176349e-06, + "loss": 0.70264465, + "num_input_tokens_seen": 29775240, + "step": 1412, + "time_per_iteration": 2.80947208404541 + }, + { + "auxiliary_loss_clip": 0.011601, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_clip": 1.04452991, + "balance_loss_mlp": 1.00523281, + "epoch": 0.16990320447303553, + "flos": 28328958506880.0, + "grad_norm": 1.904724618465659, + "language_loss": 0.75037628, + "learning_rate": 3.7983008086287044e-06, + "loss": 0.77285451, + "num_input_tokens_seen": 29796560, + "step": 1413, + "time_per_iteration": 3.8472578525543213 + }, + { + "auxiliary_loss_clip": 0.01155821, + "auxiliary_loss_mlp": 0.01090374, + "balance_loss_clip": 1.04213142, + "balance_loss_mlp": 1.00769019, + "epoch": 0.1700234473636746, + "flos": 20188189031040.0, + "grad_norm": 2.2759916908095974, + "language_loss": 0.79238844, + "learning_rate": 3.797959763280257e-06, + "loss": 0.81485045, + "num_input_tokens_seen": 29815245, + "step": 1414, + "time_per_iteration": 2.7620439529418945 + }, + { + "auxiliary_loss_clip": 0.01171414, + "auxiliary_loss_mlp": 0.01091529, + "balance_loss_clip": 1.04664242, + "balance_loss_mlp": 1.00894129, + "epoch": 0.17014369025431372, + "flos": 24858658846080.0, + "grad_norm": 2.4659909177674515, + "language_loss": 0.79434156, + "learning_rate": 3.797618445182743e-06, + "loss": 0.81697106, + "num_input_tokens_seen": 29836640, + "step": 1415, + "time_per_iteration": 3.738999843597412 + }, + { + "auxiliary_loss_clip": 0.01140701, + "auxiliary_loss_mlp": 0.01092381, + "balance_loss_clip": 1.04280126, + "balance_loss_mlp": 1.00979304, + "epoch": 0.1702639331449528, + "flos": 16467233287680.0, + "grad_norm": 2.1247910157059353, + "language_loss": 0.84731066, + "learning_rate": 3.79727685438794e-06, + "loss": 0.86964148, + "num_input_tokens_seen": 29850830, + "step": 1416, + "time_per_iteration": 2.7873406410217285 + }, + { + "auxiliary_loss_clip": 0.01195868, + "auxiliary_loss_mlp": 0.01081018, + "balance_loss_clip": 1.0822897, + "balance_loss_mlp": 1.00090981, + "epoch": 0.1703841760355919, + "flos": 52508870979840.0, + "grad_norm": 0.8358540830852063, + "language_loss": 0.61711949, + "learning_rate": 3.796934990947667e-06, + "loss": 0.63988835, + "num_input_tokens_seen": 29912515, + "step": 1417, + "time_per_iteration": 4.200344085693359 + }, + { + "auxiliary_loss_clip": 0.01191047, + "auxiliary_loss_mlp": 0.01080769, + "balance_loss_clip": 1.08347392, + "balance_loss_mlp": 1.00066078, + "epoch": 0.170504418926231, + "flos": 49370637576960.0, + "grad_norm": 0.875121016961741, + "language_loss": 0.62537909, + "learning_rate": 3.7965928549137854e-06, + "loss": 0.64809728, + "num_input_tokens_seen": 29969330, + "step": 1418, + "time_per_iteration": 3.195084810256958 + }, + { + "auxiliary_loss_clip": 0.01149907, + "auxiliary_loss_mlp": 0.01091103, + "balance_loss_clip": 1.04219842, + "balance_loss_mlp": 1.00832438, + "epoch": 0.17062466181687008, + "flos": 25849219184640.0, + "grad_norm": 1.9925096240855025, + "language_loss": 0.77529728, + "learning_rate": 3.7962504463381953e-06, + "loss": 0.79770738, + "num_input_tokens_seen": 29990820, + "step": 1419, + "time_per_iteration": 3.825352430343628 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.00874643, + "balance_loss_clip": 1.04167974, + "balance_loss_mlp": 1.00019562, + "epoch": 0.17074490470750917, + "flos": 20960412549120.0, + "grad_norm": 1.7106208198588766, + "language_loss": 0.79154015, + "learning_rate": 3.7959077652728412e-06, + "loss": 0.81184244, + "num_input_tokens_seen": 30009275, + "step": 1420, + "time_per_iteration": 2.7670533657073975 + }, + { + "auxiliary_loss_clip": 0.01159584, + "auxiliary_loss_mlp": 0.01090586, + "balance_loss_clip": 1.04471731, + "balance_loss_mlp": 1.00790238, + "epoch": 0.17086514759814825, + "flos": 20959766104320.0, + "grad_norm": 2.056211707046765, + "language_loss": 0.77273035, + "learning_rate": 3.795564811769707e-06, + "loss": 0.79523206, + "num_input_tokens_seen": 30027630, + "step": 1421, + "time_per_iteration": 2.7307279109954834 + }, + { + "auxiliary_loss_clip": 0.01160011, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_clip": 1.0465194, + "balance_loss_mlp": 1.00921822, + "epoch": 0.17098539048878736, + "flos": 28474073452800.0, + "grad_norm": 1.9168983750764903, + "language_loss": 0.77625084, + "learning_rate": 3.795221585880818e-06, + "loss": 0.79876661, + "num_input_tokens_seen": 30048310, + "step": 1422, + "time_per_iteration": 2.8624045848846436 + }, + { + "auxiliary_loss_clip": 0.0114627, + "auxiliary_loss_mlp": 0.01090733, + "balance_loss_clip": 1.04456401, + "balance_loss_mlp": 1.00824022, + "epoch": 0.17110563337942644, + "flos": 16290014561280.0, + "grad_norm": 1.7440193962841626, + "language_loss": 0.91195101, + "learning_rate": 3.794878087658242e-06, + "loss": 0.93432105, + "num_input_tokens_seen": 30066080, + "step": 1423, + "time_per_iteration": 2.717115879058838 + }, + { + "auxiliary_loss_clip": 0.01171652, + "auxiliary_loss_mlp": 0.01089716, + "balance_loss_clip": 1.04685354, + "balance_loss_mlp": 1.007128, + "epoch": 0.17122587627006552, + "flos": 29674207693440.0, + "grad_norm": 1.7655369535651864, + "language_loss": 0.78640515, + "learning_rate": 3.7945343171540873e-06, + "loss": 0.80901885, + "num_input_tokens_seen": 30086955, + "step": 1424, + "time_per_iteration": 2.8430070877075195 + }, + { + "auxiliary_loss_clip": 0.01178351, + "auxiliary_loss_mlp": 0.01089081, + "balance_loss_clip": 1.04508162, + "balance_loss_mlp": 1.0065881, + "epoch": 0.17134611916070464, + "flos": 25338389915520.0, + "grad_norm": 2.045061748233552, + "language_loss": 0.79019356, + "learning_rate": 3.7941902744205033e-06, + "loss": 0.81286788, + "num_input_tokens_seen": 30107990, + "step": 1425, + "time_per_iteration": 2.6993134021759033 + }, + { + "auxiliary_loss_clip": 0.01161417, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_clip": 1.04650807, + "balance_loss_mlp": 1.00911736, + "epoch": 0.17146636205134372, + "flos": 13953845900160.0, + "grad_norm": 2.7408380058190662, + "language_loss": 0.83543837, + "learning_rate": 3.7938459595096817e-06, + "loss": 0.8579734, + "num_input_tokens_seen": 30126535, + "step": 1426, + "time_per_iteration": 2.773082971572876 + }, + { + "auxiliary_loss_clip": 0.01170859, + "auxiliary_loss_mlp": 0.01092106, + "balance_loss_clip": 1.04634845, + "balance_loss_mlp": 1.00923133, + "epoch": 0.1715866049419828, + "flos": 23915214172800.0, + "grad_norm": 1.9426946258588471, + "language_loss": 0.86224562, + "learning_rate": 3.7935013724738545e-06, + "loss": 0.8848753, + "num_input_tokens_seen": 30147035, + "step": 1427, + "time_per_iteration": 2.7662899494171143 + }, + { + "auxiliary_loss_clip": 0.01160611, + "auxiliary_loss_mlp": 0.01090629, + "balance_loss_clip": 1.04324389, + "balance_loss_mlp": 1.00804043, + "epoch": 0.17170684783262188, + "flos": 22709369669760.0, + "grad_norm": 1.618579319857975, + "language_loss": 0.78126419, + "learning_rate": 3.7931565133652945e-06, + "loss": 0.80377662, + "num_input_tokens_seen": 30167110, + "step": 1428, + "time_per_iteration": 2.682722806930542 + }, + { + "auxiliary_loss_clip": 0.01178348, + "auxiliary_loss_mlp": 0.01091255, + "balance_loss_clip": 1.0458225, + "balance_loss_mlp": 1.00861931, + "epoch": 0.171827090723261, + "flos": 26613290315520.0, + "grad_norm": 2.2231069996447586, + "language_loss": 0.68046671, + "learning_rate": 3.792811382236317e-06, + "loss": 0.70316273, + "num_input_tokens_seen": 30185620, + "step": 1429, + "time_per_iteration": 2.705543279647827 + }, + { + "auxiliary_loss_clip": 0.01171583, + "auxiliary_loss_mlp": 0.01090115, + "balance_loss_clip": 1.04700208, + "balance_loss_mlp": 1.0073843, + "epoch": 0.17194733361390008, + "flos": 28148507556480.0, + "grad_norm": 1.908748955271878, + "language_loss": 0.78140914, + "learning_rate": 3.792465979139279e-06, + "loss": 0.80402613, + "num_input_tokens_seen": 30208225, + "step": 1430, + "time_per_iteration": 2.7703375816345215 + }, + { + "auxiliary_loss_clip": 0.01174166, + "auxiliary_loss_mlp": 0.01080105, + "balance_loss_clip": 1.07897484, + "balance_loss_mlp": 0.99999601, + "epoch": 0.17206757650453916, + "flos": 65530689753600.0, + "grad_norm": 0.9258762519101091, + "language_loss": 0.65629828, + "learning_rate": 3.792120304126576e-06, + "loss": 0.67884088, + "num_input_tokens_seen": 30271600, + "step": 1431, + "time_per_iteration": 3.4886064529418945 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01089031, + "balance_loss_clip": 1.03907871, + "balance_loss_mlp": 1.0065856, + "epoch": 0.17218781939517827, + "flos": 22273486128000.0, + "grad_norm": 2.8175179479238404, + "language_loss": 0.83951521, + "learning_rate": 3.791774357250649e-06, + "loss": 0.86158037, + "num_input_tokens_seen": 30290430, + "step": 1432, + "time_per_iteration": 2.9024388790130615 + }, + { + "auxiliary_loss_clip": 0.0115318, + "auxiliary_loss_mlp": 0.01092098, + "balance_loss_clip": 1.04019201, + "balance_loss_mlp": 1.00950992, + "epoch": 0.17230806228581735, + "flos": 14137313592960.0, + "grad_norm": 2.122807523672742, + "language_loss": 0.78820586, + "learning_rate": 3.7914281385639757e-06, + "loss": 0.81065857, + "num_input_tokens_seen": 30308305, + "step": 1433, + "time_per_iteration": 2.7821459770202637 + }, + { + "auxiliary_loss_clip": 0.01168987, + "auxiliary_loss_mlp": 0.01091493, + "balance_loss_clip": 1.0439831, + "balance_loss_mlp": 1.00909567, + "epoch": 0.17242830517645644, + "flos": 20704836303360.0, + "grad_norm": 1.859849245677059, + "language_loss": 0.79541087, + "learning_rate": 3.7910816481190784e-06, + "loss": 0.81801569, + "num_input_tokens_seen": 30328120, + "step": 1434, + "time_per_iteration": 2.75771427154541 + }, + { + "auxiliary_loss_clip": 0.01154808, + "auxiliary_loss_mlp": 0.01090964, + "balance_loss_clip": 1.04336786, + "balance_loss_mlp": 1.00818503, + "epoch": 0.17254854806709552, + "flos": 30774582887040.0, + "grad_norm": 2.063436229861691, + "language_loss": 0.7452929, + "learning_rate": 3.7907348859685193e-06, + "loss": 0.76775062, + "num_input_tokens_seen": 30349825, + "step": 1435, + "time_per_iteration": 2.8523995876312256 + }, + { + "auxiliary_loss_clip": 0.0116091, + "auxiliary_loss_mlp": 0.0109004, + "balance_loss_clip": 1.04057562, + "balance_loss_mlp": 1.00754702, + "epoch": 0.17266879095773463, + "flos": 26614726859520.0, + "grad_norm": 1.9101315231027227, + "language_loss": 0.80580091, + "learning_rate": 3.790387852164902e-06, + "loss": 0.82831043, + "num_input_tokens_seen": 30370555, + "step": 1436, + "time_per_iteration": 2.8404862880706787 + }, + { + "auxiliary_loss_clip": 0.01170809, + "auxiliary_loss_mlp": 0.01091849, + "balance_loss_clip": 1.04718292, + "balance_loss_mlp": 1.00940371, + "epoch": 0.1727890338483737, + "flos": 20266295155200.0, + "grad_norm": 1.763608531045125, + "language_loss": 0.76477695, + "learning_rate": 3.7900405467608707e-06, + "loss": 0.78740358, + "num_input_tokens_seen": 30390100, + "step": 1437, + "time_per_iteration": 2.759882926940918 + }, + { + "auxiliary_loss_clip": 0.01135275, + "auxiliary_loss_mlp": 0.01090081, + "balance_loss_clip": 1.04084337, + "balance_loss_mlp": 1.00734985, + "epoch": 0.1729092767390128, + "flos": 18179812909440.0, + "grad_norm": 3.0291789945500436, + "language_loss": 0.79442078, + "learning_rate": 3.7896929698091114e-06, + "loss": 0.81667429, + "num_input_tokens_seen": 30402915, + "step": 1438, + "time_per_iteration": 3.73872447013855 + }, + { + "auxiliary_loss_clip": 0.01181393, + "auxiliary_loss_mlp": 0.01090703, + "balance_loss_clip": 1.04909587, + "balance_loss_mlp": 1.00820994, + "epoch": 0.1730295196296519, + "flos": 26759518583040.0, + "grad_norm": 2.8938589626608318, + "language_loss": 0.68589401, + "learning_rate": 3.7893451213623518e-06, + "loss": 0.70861495, + "num_input_tokens_seen": 30420145, + "step": 1439, + "time_per_iteration": 2.737086534500122 + }, + { + "auxiliary_loss_clip": 0.01159934, + "auxiliary_loss_mlp": 0.0087462, + "balance_loss_clip": 1.03957367, + "balance_loss_mlp": 1.00024009, + "epoch": 0.173149762520291, + "flos": 23842531002240.0, + "grad_norm": 2.2469963588370896, + "language_loss": 0.82632661, + "learning_rate": 3.7889970014733606e-06, + "loss": 0.84667212, + "num_input_tokens_seen": 30439250, + "step": 1440, + "time_per_iteration": 3.69710111618042 + }, + { + "auxiliary_loss_clip": 0.01135672, + "auxiliary_loss_mlp": 0.01088851, + "balance_loss_clip": 1.04130125, + "balance_loss_mlp": 1.00640619, + "epoch": 0.17327000541093007, + "flos": 23368186972800.0, + "grad_norm": 1.7359895347034768, + "language_loss": 0.78183484, + "learning_rate": 3.7886486101949463e-06, + "loss": 0.80408007, + "num_input_tokens_seen": 30460430, + "step": 1441, + "time_per_iteration": 2.8563690185546875 + }, + { + "auxiliary_loss_clip": 0.01141111, + "auxiliary_loss_mlp": 0.01091711, + "balance_loss_clip": 1.04546356, + "balance_loss_mlp": 1.00921857, + "epoch": 0.17339024830156918, + "flos": 18221290139520.0, + "grad_norm": 1.9586462425443751, + "language_loss": 0.88114095, + "learning_rate": 3.7882999475799594e-06, + "loss": 0.9034692, + "num_input_tokens_seen": 30478465, + "step": 1442, + "time_per_iteration": 2.820664644241333 + }, + { + "auxiliary_loss_clip": 0.01138902, + "auxiliary_loss_mlp": 0.01091673, + "balance_loss_clip": 1.04279304, + "balance_loss_mlp": 1.00913274, + "epoch": 0.17351049119220827, + "flos": 23332024955520.0, + "grad_norm": 1.9747872373507205, + "language_loss": 0.81629312, + "learning_rate": 3.787951013681293e-06, + "loss": 0.83859885, + "num_input_tokens_seen": 30496510, + "step": 1443, + "time_per_iteration": 3.7522594928741455 + }, + { + "auxiliary_loss_clip": 0.01171599, + "auxiliary_loss_mlp": 0.01092101, + "balance_loss_clip": 1.04760695, + "balance_loss_mlp": 1.00908399, + "epoch": 0.17363073408284735, + "flos": 23803495896960.0, + "grad_norm": 1.8472690891306651, + "language_loss": 0.7766028, + "learning_rate": 3.787601808551879e-06, + "loss": 0.79923975, + "num_input_tokens_seen": 30516325, + "step": 1444, + "time_per_iteration": 3.6978917121887207 + }, + { + "auxiliary_loss_clip": 0.0114698, + "auxiliary_loss_mlp": 0.01091213, + "balance_loss_clip": 1.04216897, + "balance_loss_mlp": 1.00824308, + "epoch": 0.17375097697348643, + "flos": 18515290959360.0, + "grad_norm": 2.2342899389768527, + "language_loss": 0.83699185, + "learning_rate": 3.7872523322446926e-06, + "loss": 0.85937375, + "num_input_tokens_seen": 30535210, + "step": 1445, + "time_per_iteration": 2.760422706604004 + }, + { + "auxiliary_loss_clip": 0.01138032, + "auxiliary_loss_mlp": 0.01090177, + "balance_loss_clip": 1.04064417, + "balance_loss_mlp": 1.0075407, + "epoch": 0.17387121986412554, + "flos": 38877897456000.0, + "grad_norm": 1.5632374735437364, + "language_loss": 0.5979811, + "learning_rate": 3.7869025848127478e-06, + "loss": 0.62026322, + "num_input_tokens_seen": 30559405, + "step": 1446, + "time_per_iteration": 3.0669384002685547 + }, + { + "auxiliary_loss_clip": 0.01170371, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_clip": 1.04665291, + "balance_loss_mlp": 1.00907493, + "epoch": 0.17399146275476463, + "flos": 20375714960640.0, + "grad_norm": 3.1626453529410528, + "language_loss": 0.80627716, + "learning_rate": 3.786552566309102e-06, + "loss": 0.82889652, + "num_input_tokens_seen": 30577615, + "step": 1447, + "time_per_iteration": 2.826425075531006 + }, + { + "auxiliary_loss_clip": 0.01143993, + "auxiliary_loss_mlp": 0.00874645, + "balance_loss_clip": 1.04023433, + "balance_loss_mlp": 1.00021243, + "epoch": 0.1741117056454037, + "flos": 19164339763200.0, + "grad_norm": 2.1045155311382233, + "language_loss": 0.85894918, + "learning_rate": 3.7862022767868517e-06, + "loss": 0.87913555, + "num_input_tokens_seen": 30595205, + "step": 1448, + "time_per_iteration": 2.814462423324585 + }, + { + "auxiliary_loss_clip": 0.01138466, + "auxiliary_loss_mlp": 0.01092379, + "balance_loss_clip": 1.03728247, + "balance_loss_mlp": 1.00969589, + "epoch": 0.17423194853604282, + "flos": 25374300537600.0, + "grad_norm": 1.9404188518381154, + "language_loss": 0.84144014, + "learning_rate": 3.7858517162991367e-06, + "loss": 0.86374861, + "num_input_tokens_seen": 30615280, + "step": 1449, + "time_per_iteration": 2.847841262817383 + }, + { + "auxiliary_loss_clip": 0.01149977, + "auxiliary_loss_mlp": 0.0109197, + "balance_loss_clip": 1.04263425, + "balance_loss_mlp": 1.00938225, + "epoch": 0.1743521914266819, + "flos": 25191874339200.0, + "grad_norm": 2.46016445934253, + "language_loss": 0.60941511, + "learning_rate": 3.7855008848991363e-06, + "loss": 0.63183463, + "num_input_tokens_seen": 30633485, + "step": 1450, + "time_per_iteration": 2.8076043128967285 + }, + { + "auxiliary_loss_clip": 0.0115755, + "auxiliary_loss_mlp": 0.0109, + "balance_loss_clip": 1.04466617, + "balance_loss_mlp": 1.00750709, + "epoch": 0.17447243431732098, + "flos": 25666577504640.0, + "grad_norm": 1.759083867443117, + "language_loss": 0.77726758, + "learning_rate": 3.7851497826400714e-06, + "loss": 0.79974306, + "num_input_tokens_seen": 30653625, + "step": 1451, + "time_per_iteration": 2.810426950454712 + }, + { + "auxiliary_loss_clip": 0.01179365, + "auxiliary_loss_mlp": 0.01089116, + "balance_loss_clip": 1.04808688, + "balance_loss_mlp": 1.00643206, + "epoch": 0.17459267720796007, + "flos": 36281950657920.0, + "grad_norm": 1.8198293280490148, + "language_loss": 0.75925064, + "learning_rate": 3.7847984095752034e-06, + "loss": 0.78193545, + "num_input_tokens_seen": 30677080, + "step": 1452, + "time_per_iteration": 2.856782913208008 + }, + { + "auxiliary_loss_clip": 0.01176775, + "auxiliary_loss_mlp": 0.01090158, + "balance_loss_clip": 1.04488373, + "balance_loss_mlp": 1.00747442, + "epoch": 0.17471292009859918, + "flos": 20011113959040.0, + "grad_norm": 2.040757633590836, + "language_loss": 0.80399919, + "learning_rate": 3.784446765757836e-06, + "loss": 0.8266685, + "num_input_tokens_seen": 30695725, + "step": 1453, + "time_per_iteration": 2.7025747299194336 + }, + { + "auxiliary_loss_clip": 0.01140518, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_clip": 1.0405525, + "balance_loss_mlp": 1.00788105, + "epoch": 0.17483316298923826, + "flos": 27819242559360.0, + "grad_norm": 2.1202551502872073, + "language_loss": 0.77487868, + "learning_rate": 3.7840948512413133e-06, + "loss": 0.79718995, + "num_input_tokens_seen": 30713310, + "step": 1454, + "time_per_iteration": 2.864316940307617 + }, + { + "auxiliary_loss_clip": 0.01134593, + "auxiliary_loss_mlp": 0.01090983, + "balance_loss_clip": 1.03543425, + "balance_loss_mlp": 1.00820434, + "epoch": 0.17495340587987734, + "flos": 44017934791680.0, + "grad_norm": 1.8231186267846942, + "language_loss": 0.78733456, + "learning_rate": 3.7837426660790196e-06, + "loss": 0.80959028, + "num_input_tokens_seen": 30734725, + "step": 1455, + "time_per_iteration": 2.973055124282837 + }, + { + "auxiliary_loss_clip": 0.01176099, + "auxiliary_loss_mlp": 0.0108997, + "balance_loss_clip": 1.04495716, + "balance_loss_mlp": 1.00752509, + "epoch": 0.17507364877051645, + "flos": 20885825957760.0, + "grad_norm": 1.8889109424065431, + "language_loss": 0.82114279, + "learning_rate": 3.783390210324382e-06, + "loss": 0.84380347, + "num_input_tokens_seen": 30754450, + "step": 1456, + "time_per_iteration": 2.710662603378296 + }, + { + "auxiliary_loss_clip": 0.01145153, + "auxiliary_loss_mlp": 0.01090607, + "balance_loss_clip": 1.04282022, + "balance_loss_mlp": 1.00792325, + "epoch": 0.17519389166115554, + "flos": 24717602136960.0, + "grad_norm": 1.7954890624891255, + "language_loss": 0.7273581, + "learning_rate": 3.7830374840308676e-06, + "loss": 0.74971569, + "num_input_tokens_seen": 30774605, + "step": 1457, + "time_per_iteration": 2.7995851039886475 + }, + { + "auxiliary_loss_clip": 0.01167308, + "auxiliary_loss_mlp": 0.01093057, + "balance_loss_clip": 1.04520631, + "balance_loss_mlp": 1.01023018, + "epoch": 0.17531413455179462, + "flos": 23798144770560.0, + "grad_norm": 3.0416092853097303, + "language_loss": 0.8247751, + "learning_rate": 3.7826844872519842e-06, + "loss": 0.84737873, + "num_input_tokens_seen": 30792460, + "step": 1458, + "time_per_iteration": 2.8217849731445312 + }, + { + "auxiliary_loss_clip": 0.01158045, + "auxiliary_loss_mlp": 0.01089215, + "balance_loss_clip": 1.04573655, + "balance_loss_mlp": 1.00700879, + "epoch": 0.1754343774424337, + "flos": 24572379450240.0, + "grad_norm": 1.7670141260173418, + "language_loss": 0.73052323, + "learning_rate": 3.782331220041282e-06, + "loss": 0.75299579, + "num_input_tokens_seen": 30812525, + "step": 1459, + "time_per_iteration": 2.7644996643066406 + }, + { + "auxiliary_loss_clip": 0.01151069, + "auxiliary_loss_mlp": 0.01091387, + "balance_loss_clip": 1.04485238, + "balance_loss_mlp": 1.00865579, + "epoch": 0.17555462033307281, + "flos": 18114599767680.0, + "grad_norm": 3.5305852540614966, + "language_loss": 0.82954144, + "learning_rate": 3.7819776824523504e-06, + "loss": 0.85196602, + "num_input_tokens_seen": 30830390, + "step": 1460, + "time_per_iteration": 2.742133617401123 + }, + { + "auxiliary_loss_clip": 0.01160201, + "auxiliary_loss_mlp": 0.01090558, + "balance_loss_clip": 1.04572105, + "balance_loss_mlp": 1.00777888, + "epoch": 0.1756748632237119, + "flos": 28366018364160.0, + "grad_norm": 1.8879583582637112, + "language_loss": 0.83806556, + "learning_rate": 3.7816238745388213e-06, + "loss": 0.86057311, + "num_input_tokens_seen": 30849935, + "step": 1461, + "time_per_iteration": 2.7910783290863037 + }, + { + "auxiliary_loss_clip": 0.01157764, + "auxiliary_loss_mlp": 0.01091777, + "balance_loss_clip": 1.0424782, + "balance_loss_mlp": 1.00928378, + "epoch": 0.17579510611435098, + "flos": 25732939881600.0, + "grad_norm": 1.9963737714915153, + "language_loss": 0.86892724, + "learning_rate": 3.781269796354367e-06, + "loss": 0.89142263, + "num_input_tokens_seen": 30869555, + "step": 1462, + "time_per_iteration": 2.7848339080810547 + }, + { + "auxiliary_loss_clip": 0.01152679, + "auxiliary_loss_mlp": 0.01092102, + "balance_loss_clip": 1.04203987, + "balance_loss_mlp": 1.00941861, + "epoch": 0.1759153490049901, + "flos": 18588081870720.0, + "grad_norm": 1.7847944101522921, + "language_loss": 0.85962498, + "learning_rate": 3.7809154479527006e-06, + "loss": 0.88207281, + "num_input_tokens_seen": 30888760, + "step": 1463, + "time_per_iteration": 2.785240650177002 + }, + { + "auxiliary_loss_clip": 0.01138619, + "auxiliary_loss_mlp": 0.01091496, + "balance_loss_clip": 1.04169989, + "balance_loss_mlp": 1.00890768, + "epoch": 0.17603559189562917, + "flos": 18619323724800.0, + "grad_norm": 2.1600079735468998, + "language_loss": 0.84379041, + "learning_rate": 3.780560829387577e-06, + "loss": 0.86609149, + "num_input_tokens_seen": 30907260, + "step": 1464, + "time_per_iteration": 3.7233469486236572 + }, + { + "auxiliary_loss_clip": 0.01174021, + "auxiliary_loss_mlp": 0.0108033, + "balance_loss_clip": 1.06329429, + "balance_loss_mlp": 1.00022185, + "epoch": 0.17615583478626826, + "flos": 60530775373440.0, + "grad_norm": 0.8517079649269331, + "language_loss": 0.57901764, + "learning_rate": 3.7802059407127915e-06, + "loss": 0.60156113, + "num_input_tokens_seen": 30965810, + "step": 1465, + "time_per_iteration": 4.173717737197876 + }, + { + "auxiliary_loss_clip": 0.01159997, + "auxiliary_loss_mlp": 0.01089948, + "balance_loss_clip": 1.04463148, + "balance_loss_mlp": 1.00745511, + "epoch": 0.17627607767690734, + "flos": 23616221362560.0, + "grad_norm": 2.0922508981035106, + "language_loss": 0.86188978, + "learning_rate": 3.7798507819821797e-06, + "loss": 0.88438928, + "num_input_tokens_seen": 30982935, + "step": 1466, + "time_per_iteration": 2.765293836593628 + }, + { + "auxiliary_loss_clip": 0.01139988, + "auxiliary_loss_mlp": 0.01089359, + "balance_loss_clip": 1.03921413, + "balance_loss_mlp": 1.00667596, + "epoch": 0.17639632056754645, + "flos": 17639070589440.0, + "grad_norm": 2.1336422839186016, + "language_loss": 0.79202175, + "learning_rate": 3.7794953532496197e-06, + "loss": 0.8143152, + "num_input_tokens_seen": 30998840, + "step": 1467, + "time_per_iteration": 2.7273945808410645 + }, + { + "auxiliary_loss_clip": 0.01125726, + "auxiliary_loss_mlp": 0.00873945, + "balance_loss_clip": 1.04207683, + "balance_loss_mlp": 1.00018978, + "epoch": 0.17651656345818553, + "flos": 57932604910080.0, + "grad_norm": 0.8975870663456361, + "language_loss": 0.57988143, + "learning_rate": 3.7791396545690295e-06, + "loss": 0.59987819, + "num_input_tokens_seen": 31060075, + "step": 1468, + "time_per_iteration": 4.2809484004974365 + }, + { + "auxiliary_loss_clip": 0.01167241, + "auxiliary_loss_mlp": 0.01091479, + "balance_loss_clip": 1.04585671, + "balance_loss_mlp": 1.00893879, + "epoch": 0.17663680634882462, + "flos": 22929502170240.0, + "grad_norm": 1.9082345662038285, + "language_loss": 0.80949271, + "learning_rate": 3.7787836859943685e-06, + "loss": 0.83207989, + "num_input_tokens_seen": 31078800, + "step": 1469, + "time_per_iteration": 2.7514283657073975 + }, + { + "auxiliary_loss_clip": 0.01164344, + "auxiliary_loss_mlp": 0.0108916, + "balance_loss_clip": 1.04288375, + "balance_loss_mlp": 1.00652385, + "epoch": 0.17675704923946373, + "flos": 22637979388800.0, + "grad_norm": 2.4052928518930887, + "language_loss": 0.79104352, + "learning_rate": 3.7784274475796363e-06, + "loss": 0.81357855, + "num_input_tokens_seen": 31097430, + "step": 1470, + "time_per_iteration": 3.6560142040252686 + }, + { + "auxiliary_loss_clip": 0.01138176, + "auxiliary_loss_mlp": 0.01090164, + "balance_loss_clip": 1.04055274, + "balance_loss_mlp": 1.00738478, + "epoch": 0.1768772921301028, + "flos": 27126525795840.0, + "grad_norm": 2.1625906692763097, + "language_loss": 0.76555276, + "learning_rate": 3.7780709393788745e-06, + "loss": 0.78783613, + "num_input_tokens_seen": 31117905, + "step": 1471, + "time_per_iteration": 2.852095365524292 + }, + { + "auxiliary_loss_clip": 0.0117569, + "auxiliary_loss_mlp": 0.01091628, + "balance_loss_clip": 1.04534137, + "balance_loss_mlp": 1.00889635, + "epoch": 0.1769975350207419, + "flos": 19172133014400.0, + "grad_norm": 2.082800530744631, + "language_loss": 0.753016, + "learning_rate": 3.777714161446165e-06, + "loss": 0.77568918, + "num_input_tokens_seen": 31137610, + "step": 1472, + "time_per_iteration": 2.6538405418395996 + }, + { + "auxiliary_loss_clip": 0.01166753, + "auxiliary_loss_mlp": 0.01089291, + "balance_loss_clip": 1.04425359, + "balance_loss_mlp": 1.00660789, + "epoch": 0.177117777911381, + "flos": 36134932291200.0, + "grad_norm": 13.749911176540278, + "language_loss": 0.69103801, + "learning_rate": 3.7773571138356304e-06, + "loss": 0.71359849, + "num_input_tokens_seen": 31157780, + "step": 1473, + "time_per_iteration": 2.8107218742370605 + }, + { + "auxiliary_loss_clip": 0.01124962, + "auxiliary_loss_mlp": 0.01090845, + "balance_loss_clip": 1.03840661, + "balance_loss_mlp": 1.00839996, + "epoch": 0.17723802080202009, + "flos": 22090593052800.0, + "grad_norm": 2.0938471512099297, + "language_loss": 0.89211351, + "learning_rate": 3.776999796601435e-06, + "loss": 0.91427159, + "num_input_tokens_seen": 31176540, + "step": 1474, + "time_per_iteration": 2.893460512161255 + }, + { + "auxiliary_loss_clip": 0.01167393, + "auxiliary_loss_mlp": 0.01088656, + "balance_loss_clip": 1.04487741, + "balance_loss_mlp": 1.00630641, + "epoch": 0.17735826369265917, + "flos": 30222671437440.0, + "grad_norm": 1.9925196637179223, + "language_loss": 0.72955918, + "learning_rate": 3.776642209797783e-06, + "loss": 0.75211966, + "num_input_tokens_seen": 31198370, + "step": 1475, + "time_per_iteration": 2.7626969814300537 + }, + { + "auxiliary_loss_clip": 0.011672, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_clip": 1.04375362, + "balance_loss_mlp": 1.0100354, + "epoch": 0.17747850658329825, + "flos": 21397588980480.0, + "grad_norm": 2.1612862601574556, + "language_loss": 0.77843595, + "learning_rate": 3.7762843534789205e-06, + "loss": 0.80103606, + "num_input_tokens_seen": 31217120, + "step": 1476, + "time_per_iteration": 2.7307257652282715 + }, + { + "auxiliary_loss_clip": 0.01157709, + "auxiliary_loss_mlp": 0.01089349, + "balance_loss_clip": 1.04307127, + "balance_loss_mlp": 1.00671339, + "epoch": 0.17759874947393736, + "flos": 16983341856000.0, + "grad_norm": 2.107811795617832, + "language_loss": 0.88550806, + "learning_rate": 3.7759262276991343e-06, + "loss": 0.90797865, + "num_input_tokens_seen": 31234730, + "step": 1477, + "time_per_iteration": 2.7205092906951904 + }, + { + "auxiliary_loss_clip": 0.01154729, + "auxiliary_loss_mlp": 0.01091512, + "balance_loss_clip": 1.0422473, + "balance_loss_mlp": 1.00873291, + "epoch": 0.17771899236457644, + "flos": 11546107390080.0, + "grad_norm": 2.377999996575465, + "language_loss": 0.80560225, + "learning_rate": 3.7755678325127506e-06, + "loss": 0.82806462, + "num_input_tokens_seen": 31252410, + "step": 1478, + "time_per_iteration": 2.812819242477417 + }, + { + "auxiliary_loss_clip": 0.01121795, + "auxiliary_loss_mlp": 0.01088095, + "balance_loss_clip": 1.03200388, + "balance_loss_mlp": 1.00574577, + "epoch": 0.17783923525521553, + "flos": 18807747494400.0, + "grad_norm": 1.9131544510333514, + "language_loss": 0.75615513, + "learning_rate": 3.7752091679741393e-06, + "loss": 0.77825403, + "num_input_tokens_seen": 31270200, + "step": 1479, + "time_per_iteration": 2.8713955879211426 + }, + { + "auxiliary_loss_clip": 0.01163443, + "auxiliary_loss_mlp": 0.01091698, + "balance_loss_clip": 1.04326892, + "balance_loss_mlp": 1.00901484, + "epoch": 0.17795947814585464, + "flos": 30408365773440.0, + "grad_norm": 2.6535373548825607, + "language_loss": 0.77411199, + "learning_rate": 3.774850234137708e-06, + "loss": 0.7966634, + "num_input_tokens_seen": 31287495, + "step": 1480, + "time_per_iteration": 2.7494351863861084 + }, + { + "auxiliary_loss_clip": 0.01166837, + "auxiliary_loss_mlp": 0.01093319, + "balance_loss_clip": 1.04411316, + "balance_loss_mlp": 1.01068354, + "epoch": 0.17807972103649372, + "flos": 24389055411840.0, + "grad_norm": 2.3688363261564613, + "language_loss": 0.82831192, + "learning_rate": 3.7744910310579076e-06, + "loss": 0.85091347, + "num_input_tokens_seen": 31306420, + "step": 1481, + "time_per_iteration": 2.848665475845337 + }, + { + "auxiliary_loss_clip": 0.01176012, + "auxiliary_loss_mlp": 0.01089787, + "balance_loss_clip": 1.04558301, + "balance_loss_mlp": 1.00753307, + "epoch": 0.1781999639271328, + "flos": 20301559332480.0, + "grad_norm": 2.0853061852763446, + "language_loss": 0.85163313, + "learning_rate": 3.774131558789229e-06, + "loss": 0.87429112, + "num_input_tokens_seen": 31325750, + "step": 1482, + "time_per_iteration": 2.685213327407837 + }, + { + "auxiliary_loss_clip": 0.01176415, + "auxiliary_loss_mlp": 0.00874471, + "balance_loss_clip": 1.0461328, + "balance_loss_mlp": 1.00027299, + "epoch": 0.1783202068177719, + "flos": 15924479806080.0, + "grad_norm": 3.0973697284541997, + "language_loss": 0.70190817, + "learning_rate": 3.773771817386203e-06, + "loss": 0.72241706, + "num_input_tokens_seen": 31343080, + "step": 1483, + "time_per_iteration": 2.6683969497680664 + }, + { + "auxiliary_loss_clip": 0.01152205, + "auxiliary_loss_mlp": 0.01091163, + "balance_loss_clip": 1.04012656, + "balance_loss_mlp": 1.00876522, + "epoch": 0.178440449708411, + "flos": 20631758083200.0, + "grad_norm": 1.527487450825801, + "language_loss": 0.79540563, + "learning_rate": 3.773411806903403e-06, + "loss": 0.81783926, + "num_input_tokens_seen": 31362160, + "step": 1484, + "time_per_iteration": 2.7540009021759033 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.0109073, + "balance_loss_clip": 1.04176772, + "balance_loss_mlp": 1.0082376, + "epoch": 0.17856069259905008, + "flos": 21686059105920.0, + "grad_norm": 1.897214515597753, + "language_loss": 0.94561517, + "learning_rate": 3.7730515273954415e-06, + "loss": 0.9677698, + "num_input_tokens_seen": 31380770, + "step": 1485, + "time_per_iteration": 2.9082839488983154 + }, + { + "auxiliary_loss_clip": 0.01176271, + "auxiliary_loss_mlp": 0.01090812, + "balance_loss_clip": 1.04575384, + "balance_loss_mlp": 1.00831926, + "epoch": 0.17868093548968916, + "flos": 26572962320640.0, + "grad_norm": 1.943722078180287, + "language_loss": 0.85067159, + "learning_rate": 3.772690978916973e-06, + "loss": 0.87334245, + "num_input_tokens_seen": 31400525, + "step": 1486, + "time_per_iteration": 2.720207452774048 + }, + { + "auxiliary_loss_clip": 0.01160874, + "auxiliary_loss_mlp": 0.01089848, + "balance_loss_clip": 1.04032075, + "balance_loss_mlp": 1.00735569, + "epoch": 0.17880117838032827, + "flos": 18581006891520.0, + "grad_norm": 6.911017048620604, + "language_loss": 0.86819077, + "learning_rate": 3.772330161522693e-06, + "loss": 0.89069796, + "num_input_tokens_seen": 31418435, + "step": 1487, + "time_per_iteration": 2.712289810180664 + }, + { + "auxiliary_loss_clip": 0.01146497, + "auxiliary_loss_mlp": 0.01091305, + "balance_loss_clip": 1.04118299, + "balance_loss_mlp": 1.00871634, + "epoch": 0.17892142127096736, + "flos": 26541217676160.0, + "grad_norm": 2.0797564311937085, + "language_loss": 0.79829198, + "learning_rate": 3.7719690752673365e-06, + "loss": 0.82067001, + "num_input_tokens_seen": 31439230, + "step": 1488, + "time_per_iteration": 2.7682876586914062 + }, + { + "auxiliary_loss_clip": 0.01144635, + "auxiliary_loss_mlp": 0.0109121, + "balance_loss_clip": 1.04204535, + "balance_loss_mlp": 1.00881243, + "epoch": 0.17904166416160644, + "flos": 23872623621120.0, + "grad_norm": 1.8982715340244984, + "language_loss": 0.78430659, + "learning_rate": 3.7716077202056796e-06, + "loss": 0.80666506, + "num_input_tokens_seen": 31457705, + "step": 1489, + "time_per_iteration": 3.7908082008361816 + }, + { + "auxiliary_loss_clip": 0.01150803, + "auxiliary_loss_mlp": 0.01088621, + "balance_loss_clip": 1.0421077, + "balance_loss_mlp": 1.00608087, + "epoch": 0.17916190705224552, + "flos": 19134426712320.0, + "grad_norm": 2.2910716680095504, + "language_loss": 0.93838775, + "learning_rate": 3.7712460963925404e-06, + "loss": 0.96078205, + "num_input_tokens_seen": 31473645, + "step": 1490, + "time_per_iteration": 3.6583003997802734 + }, + { + "auxiliary_loss_clip": 0.01152193, + "auxiliary_loss_mlp": 0.01089344, + "balance_loss_clip": 1.04120827, + "balance_loss_mlp": 1.0071373, + "epoch": 0.17928214994288463, + "flos": 25152120961920.0, + "grad_norm": 1.7816673171147106, + "language_loss": 0.75667751, + "learning_rate": 3.7708842038827775e-06, + "loss": 0.77909285, + "num_input_tokens_seen": 31492605, + "step": 1491, + "time_per_iteration": 2.8045012950897217 + }, + { + "auxiliary_loss_clip": 0.01163801, + "auxiliary_loss_mlp": 0.01090089, + "balance_loss_clip": 1.04251444, + "balance_loss_mlp": 1.00773954, + "epoch": 0.17940239283352372, + "flos": 22384629786240.0, + "grad_norm": 2.4457055238719185, + "language_loss": 0.85751879, + "learning_rate": 3.770522042731288e-06, + "loss": 0.88005769, + "num_input_tokens_seen": 31514500, + "step": 1492, + "time_per_iteration": 2.7706751823425293 + }, + { + "auxiliary_loss_clip": 0.01131486, + "auxiliary_loss_mlp": 0.01091621, + "balance_loss_clip": 1.04036641, + "balance_loss_mlp": 1.0090332, + "epoch": 0.1795226357241628, + "flos": 23178685795200.0, + "grad_norm": 2.103529015415088, + "language_loss": 0.88121378, + "learning_rate": 3.7701596129930122e-06, + "loss": 0.90344489, + "num_input_tokens_seen": 31533225, + "step": 1493, + "time_per_iteration": 2.8341946601867676 + }, + { + "auxiliary_loss_clip": 0.01146238, + "auxiliary_loss_mlp": 0.01091539, + "balance_loss_clip": 1.04249358, + "balance_loss_mlp": 1.00895095, + "epoch": 0.1796428786148019, + "flos": 22090413484800.0, + "grad_norm": 2.6721306465822483, + "language_loss": 0.73668826, + "learning_rate": 3.7697969147229315e-06, + "loss": 0.75906605, + "num_input_tokens_seen": 31551385, + "step": 1494, + "time_per_iteration": 3.720306396484375 + }, + { + "auxiliary_loss_clip": 0.01164365, + "auxiliary_loss_mlp": 0.01090955, + "balance_loss_clip": 1.04333901, + "balance_loss_mlp": 1.0086534, + "epoch": 0.179763121505441, + "flos": 21324618501120.0, + "grad_norm": 1.872436238819421, + "language_loss": 0.85697585, + "learning_rate": 3.7694339479760647e-06, + "loss": 0.87952912, + "num_input_tokens_seen": 31570415, + "step": 1495, + "time_per_iteration": 2.68410587310791 + }, + { + "auxiliary_loss_clip": 0.01150888, + "auxiliary_loss_mlp": 0.0108023, + "balance_loss_clip": 1.05010629, + "balance_loss_mlp": 1.00012159, + "epoch": 0.17988336439608008, + "flos": 68161864815360.0, + "grad_norm": 0.7695477890402301, + "language_loss": 0.5729565, + "learning_rate": 3.769070712807476e-06, + "loss": 0.59526771, + "num_input_tokens_seen": 31632445, + "step": 1496, + "time_per_iteration": 4.292314291000366 + }, + { + "auxiliary_loss_clip": 0.01114831, + "auxiliary_loss_mlp": 0.0109257, + "balance_loss_clip": 1.03668916, + "balance_loss_mlp": 1.01007748, + "epoch": 0.18000360728671919, + "flos": 21945047143680.0, + "grad_norm": 1.8747990223878215, + "language_loss": 0.78994775, + "learning_rate": 3.768707209272266e-06, + "loss": 0.81202173, + "num_input_tokens_seen": 31652575, + "step": 1497, + "time_per_iteration": 2.8733367919921875 + }, + { + "auxiliary_loss_clip": 0.01154228, + "auxiliary_loss_mlp": 0.01090035, + "balance_loss_clip": 1.04171491, + "balance_loss_mlp": 1.00735116, + "epoch": 0.18012385017735827, + "flos": 18986330937600.0, + "grad_norm": 2.5003713096344047, + "language_loss": 0.76594102, + "learning_rate": 3.768343437425579e-06, + "loss": 0.78838366, + "num_input_tokens_seen": 31671145, + "step": 1498, + "time_per_iteration": 2.8205349445343018 + }, + { + "auxiliary_loss_clip": 0.01103704, + "auxiliary_loss_mlp": 0.01089128, + "balance_loss_clip": 1.03007078, + "balance_loss_mlp": 1.0065397, + "epoch": 0.18024409306799735, + "flos": 19748103598080.0, + "grad_norm": 2.2215787527347466, + "language_loss": 0.86007255, + "learning_rate": 3.7679793973225987e-06, + "loss": 0.88200086, + "num_input_tokens_seen": 31686955, + "step": 1499, + "time_per_iteration": 2.9154586791992188 + }, + { + "auxiliary_loss_clip": 0.01130188, + "auxiliary_loss_mlp": 0.0108013, + "balance_loss_clip": 1.04549575, + "balance_loss_mlp": 1.00002182, + "epoch": 0.18036433595863643, + "flos": 67227183060480.0, + "grad_norm": 0.848237360168443, + "language_loss": 0.61641645, + "learning_rate": 3.767615089018549e-06, + "loss": 0.63851964, + "num_input_tokens_seen": 31749300, + "step": 1500, + "time_per_iteration": 3.3919994831085205 + }, + { + "auxiliary_loss_clip": 0.01154902, + "auxiliary_loss_mlp": 0.01092337, + "balance_loss_clip": 1.04251492, + "balance_loss_mlp": 1.00970101, + "epoch": 0.18048457884927555, + "flos": 18181464935040.0, + "grad_norm": 1.9708227934659202, + "language_loss": 0.86496782, + "learning_rate": 3.7672505125686966e-06, + "loss": 0.8874402, + "num_input_tokens_seen": 31765665, + "step": 1501, + "time_per_iteration": 2.8624982833862305 + }, + { + "auxiliary_loss_clip": 0.01133681, + "auxiliary_loss_mlp": 0.01090752, + "balance_loss_clip": 1.03911936, + "balance_loss_mlp": 1.00825953, + "epoch": 0.18060482173991463, + "flos": 15813767111040.0, + "grad_norm": 3.4613083237442788, + "language_loss": 0.83955997, + "learning_rate": 3.7668856680283455e-06, + "loss": 0.86180437, + "num_input_tokens_seen": 31782690, + "step": 1502, + "time_per_iteration": 2.9138920307159424 + }, + { + "auxiliary_loss_clip": 0.0115411, + "auxiliary_loss_mlp": 0.01089812, + "balance_loss_clip": 1.04246497, + "balance_loss_mlp": 1.0071758, + "epoch": 0.1807250646305537, + "flos": 18587399512320.0, + "grad_norm": 2.1184216795390256, + "language_loss": 0.82493615, + "learning_rate": 3.7665205554528437e-06, + "loss": 0.84737539, + "num_input_tokens_seen": 31802045, + "step": 1503, + "time_per_iteration": 2.819767951965332 + }, + { + "auxiliary_loss_clip": 0.01151302, + "auxiliary_loss_mlp": 0.01090174, + "balance_loss_clip": 1.0405972, + "balance_loss_mlp": 1.0075376, + "epoch": 0.18084530752119282, + "flos": 23149131880320.0, + "grad_norm": 2.0070138852889574, + "language_loss": 0.74390173, + "learning_rate": 3.7661551748975782e-06, + "loss": 0.76631641, + "num_input_tokens_seen": 31820220, + "step": 1504, + "time_per_iteration": 2.7474231719970703 + }, + { + "auxiliary_loss_clip": 0.01147805, + "auxiliary_loss_mlp": 0.01080699, + "balance_loss_clip": 1.04636478, + "balance_loss_mlp": 1.00059056, + "epoch": 0.1809655504118319, + "flos": 59803153568640.0, + "grad_norm": 0.8131704991811668, + "language_loss": 0.60476434, + "learning_rate": 3.7657895264179772e-06, + "loss": 0.62704939, + "num_input_tokens_seen": 31876195, + "step": 1505, + "time_per_iteration": 3.276217222213745 + }, + { + "auxiliary_loss_clip": 0.01151303, + "auxiliary_loss_mlp": 0.01090709, + "balance_loss_clip": 1.04160786, + "balance_loss_mlp": 1.0082159, + "epoch": 0.181085793302471, + "flos": 44201941188480.0, + "grad_norm": 1.875550734980915, + "language_loss": 0.74330348, + "learning_rate": 3.765423610069509e-06, + "loss": 0.76572359, + "num_input_tokens_seen": 31901585, + "step": 1506, + "time_per_iteration": 2.9731404781341553 + }, + { + "auxiliary_loss_clip": 0.01151847, + "auxiliary_loss_mlp": 0.01090595, + "balance_loss_clip": 1.04040229, + "balance_loss_mlp": 1.00824547, + "epoch": 0.18120603619311007, + "flos": 34898384638080.0, + "grad_norm": 1.7620705935779932, + "language_loss": 0.72281885, + "learning_rate": 3.765057425907683e-06, + "loss": 0.74524331, + "num_input_tokens_seen": 31923045, + "step": 1507, + "time_per_iteration": 2.8457674980163574 + }, + { + "auxiliary_loss_clip": 0.0116353, + "auxiliary_loss_mlp": 0.01090654, + "balance_loss_clip": 1.04205883, + "balance_loss_mlp": 1.00811315, + "epoch": 0.18132627908374918, + "flos": 21506757390720.0, + "grad_norm": 2.6771031737699955, + "language_loss": 0.78248668, + "learning_rate": 3.764690973988048e-06, + "loss": 0.80502856, + "num_input_tokens_seen": 31943385, + "step": 1508, + "time_per_iteration": 2.755812406539917 + }, + { + "auxiliary_loss_clip": 0.01142954, + "auxiliary_loss_mlp": 0.01091247, + "balance_loss_clip": 1.04120195, + "balance_loss_mlp": 1.0088501, + "epoch": 0.18144652197438826, + "flos": 29057693633280.0, + "grad_norm": 1.9658891173718351, + "language_loss": 0.73399758, + "learning_rate": 3.7643242543661967e-06, + "loss": 0.75633967, + "num_input_tokens_seen": 31966045, + "step": 1509, + "time_per_iteration": 2.8429622650146484 + }, + { + "auxiliary_loss_clip": 0.01142639, + "auxiliary_loss_mlp": 0.01080613, + "balance_loss_clip": 1.04315543, + "balance_loss_mlp": 1.00050414, + "epoch": 0.18156676486502735, + "flos": 68675064382080.0, + "grad_norm": 0.8106552431614741, + "language_loss": 0.60527086, + "learning_rate": 3.7639572670977573e-06, + "loss": 0.6275034, + "num_input_tokens_seen": 32021540, + "step": 1510, + "time_per_iteration": 3.19439959526062 + }, + { + "auxiliary_loss_clip": 0.0114453, + "auxiliary_loss_mlp": 0.01090476, + "balance_loss_clip": 1.04130578, + "balance_loss_mlp": 1.00822115, + "epoch": 0.18168700775566646, + "flos": 26471515334400.0, + "grad_norm": 1.966490535420772, + "language_loss": 0.76111948, + "learning_rate": 3.7635900122384042e-06, + "loss": 0.78346956, + "num_input_tokens_seen": 32044535, + "step": 1511, + "time_per_iteration": 2.8746626377105713 + }, + { + "auxiliary_loss_clip": 0.01146423, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_clip": 1.04112029, + "balance_loss_mlp": 1.00825286, + "epoch": 0.18180725064630554, + "flos": 15005668884480.0, + "grad_norm": 2.978424997991378, + "language_loss": 0.86668134, + "learning_rate": 3.7632224898438477e-06, + "loss": 0.88905686, + "num_input_tokens_seen": 32061010, + "step": 1512, + "time_per_iteration": 2.698948621749878 + }, + { + "auxiliary_loss_clip": 0.01144121, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_clip": 1.04015255, + "balance_loss_mlp": 1.00789332, + "epoch": 0.18192749353694462, + "flos": 19682387665920.0, + "grad_norm": 1.6657892770442604, + "language_loss": 0.79172373, + "learning_rate": 3.762854699969842e-06, + "loss": 0.81406784, + "num_input_tokens_seen": 32081520, + "step": 1513, + "time_per_iteration": 3.730436325073242 + }, + { + "auxiliary_loss_clip": 0.01161178, + "auxiliary_loss_mlp": 0.01089683, + "balance_loss_clip": 1.04238701, + "balance_loss_mlp": 1.00752378, + "epoch": 0.1820477364275837, + "flos": 20702717400960.0, + "grad_norm": 1.7747610683349815, + "language_loss": 0.7282325, + "learning_rate": 3.762486642672179e-06, + "loss": 0.75074112, + "num_input_tokens_seen": 32098460, + "step": 1514, + "time_per_iteration": 2.6832752227783203 + }, + { + "auxiliary_loss_clip": 0.01147059, + "auxiliary_loss_mlp": 0.01089653, + "balance_loss_clip": 1.04151618, + "balance_loss_mlp": 1.00711286, + "epoch": 0.18216797931822282, + "flos": 17128708197120.0, + "grad_norm": 1.8804120418923476, + "language_loss": 0.869753, + "learning_rate": 3.7621183180066946e-06, + "loss": 0.89212012, + "num_input_tokens_seen": 32116420, + "step": 1515, + "time_per_iteration": 2.730886697769165 + }, + { + "auxiliary_loss_clip": 0.01156626, + "auxiliary_loss_mlp": 0.0109006, + "balance_loss_clip": 1.04317379, + "balance_loss_mlp": 1.00775766, + "epoch": 0.1822882222088619, + "flos": 29242561956480.0, + "grad_norm": 1.5928343439425625, + "language_loss": 0.73919958, + "learning_rate": 3.7617497260292625e-06, + "loss": 0.76166642, + "num_input_tokens_seen": 32138475, + "step": 1516, + "time_per_iteration": 3.832489013671875 + }, + { + "auxiliary_loss_clip": 0.01143759, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.0097506, + "epoch": 0.18240846509950098, + "flos": 17702739446400.0, + "grad_norm": 2.781622378960716, + "language_loss": 0.79077405, + "learning_rate": 3.7613808667957967e-06, + "loss": 0.81313598, + "num_input_tokens_seen": 32151165, + "step": 1517, + "time_per_iteration": 2.624293565750122 + }, + { + "auxiliary_loss_clip": 0.01151689, + "auxiliary_loss_mlp": 0.0108996, + "balance_loss_clip": 1.04036868, + "balance_loss_mlp": 1.0072763, + "epoch": 0.1825287079901401, + "flos": 14790025584000.0, + "grad_norm": 2.208603584531137, + "language_loss": 0.90893161, + "learning_rate": 3.7610117403622547e-06, + "loss": 0.9313482, + "num_input_tokens_seen": 32167725, + "step": 1518, + "time_per_iteration": 2.7713875770568848 + }, + { + "auxiliary_loss_clip": 0.0114361, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_clip": 1.04012418, + "balance_loss_mlp": 1.00836992, + "epoch": 0.18264895088077918, + "flos": 21946232292480.0, + "grad_norm": 1.7426475929344258, + "language_loss": 0.89887118, + "learning_rate": 3.7606423467846313e-06, + "loss": 0.92121649, + "num_input_tokens_seen": 32187330, + "step": 1519, + "time_per_iteration": 3.7700035572052 + }, + { + "auxiliary_loss_clip": 0.01144298, + "auxiliary_loss_mlp": 0.01092157, + "balance_loss_clip": 1.04173493, + "balance_loss_mlp": 1.00961649, + "epoch": 0.18276919377141826, + "flos": 20886759711360.0, + "grad_norm": 1.568996065832073, + "language_loss": 0.79407418, + "learning_rate": 3.760272686118964e-06, + "loss": 0.81643867, + "num_input_tokens_seen": 32205550, + "step": 1520, + "time_per_iteration": 2.7798879146575928 + }, + { + "auxiliary_loss_clip": 0.01151862, + "auxiliary_loss_mlp": 0.01090585, + "balance_loss_clip": 1.04025996, + "balance_loss_mlp": 1.00804424, + "epoch": 0.18288943666205737, + "flos": 21469877101440.0, + "grad_norm": 2.4836788818653117, + "language_loss": 0.92911273, + "learning_rate": 3.7599027584213297e-06, + "loss": 0.95153719, + "num_input_tokens_seen": 32224430, + "step": 1521, + "time_per_iteration": 3.7454240322113037 + }, + { + "auxiliary_loss_clip": 0.01164527, + "auxiliary_loss_mlp": 0.01091352, + "balance_loss_clip": 1.0433197, + "balance_loss_mlp": 1.00881147, + "epoch": 0.18300967955269645, + "flos": 21539363961600.0, + "grad_norm": 2.4529539341354423, + "language_loss": 0.77753174, + "learning_rate": 3.7595325637478465e-06, + "loss": 0.80009055, + "num_input_tokens_seen": 32242455, + "step": 1522, + "time_per_iteration": 2.664008378982544 + }, + { + "auxiliary_loss_clip": 0.01149057, + "auxiliary_loss_mlp": 0.01090875, + "balance_loss_clip": 1.04375172, + "balance_loss_mlp": 1.00838232, + "epoch": 0.18312992244333554, + "flos": 28876237102080.0, + "grad_norm": 1.8274806927068485, + "language_loss": 0.81551898, + "learning_rate": 3.7591621021546723e-06, + "loss": 0.83791828, + "num_input_tokens_seen": 32264450, + "step": 1523, + "time_per_iteration": 2.820629358291626 + }, + { + "auxiliary_loss_clip": 0.0116408, + "auxiliary_loss_mlp": 0.010912, + "balance_loss_clip": 1.04287028, + "balance_loss_mlp": 1.00861239, + "epoch": 0.18325016533397462, + "flos": 20120102801280.0, + "grad_norm": 1.7014921192062527, + "language_loss": 0.81516254, + "learning_rate": 3.7587913736980062e-06, + "loss": 0.83771533, + "num_input_tokens_seen": 32284090, + "step": 1524, + "time_per_iteration": 2.7287731170654297 + }, + { + "auxiliary_loss_clip": 0.01114565, + "auxiliary_loss_mlp": 0.01090034, + "balance_loss_clip": 1.03727388, + "balance_loss_mlp": 1.00787449, + "epoch": 0.18337040822461373, + "flos": 23329187781120.0, + "grad_norm": 1.7138915071110847, + "language_loss": 0.8463794, + "learning_rate": 3.7584203784340865e-06, + "loss": 0.86842537, + "num_input_tokens_seen": 32303260, + "step": 1525, + "time_per_iteration": 2.908778190612793 + }, + { + "auxiliary_loss_clip": 0.01150808, + "auxiliary_loss_mlp": 0.01088658, + "balance_loss_clip": 1.0404315, + "balance_loss_mlp": 1.00616527, + "epoch": 0.1834906511152528, + "flos": 25009555881600.0, + "grad_norm": 1.971546389967948, + "language_loss": 0.8570413, + "learning_rate": 3.7580491164191938e-06, + "loss": 0.87943596, + "num_input_tokens_seen": 32321570, + "step": 1526, + "time_per_iteration": 2.8504128456115723 + }, + { + "auxiliary_loss_clip": 0.01153562, + "auxiliary_loss_mlp": 0.01081009, + "balance_loss_clip": 1.0448873, + "balance_loss_mlp": 1.00090015, + "epoch": 0.1836108940058919, + "flos": 67251493589760.0, + "grad_norm": 0.7476832839925769, + "language_loss": 0.61308497, + "learning_rate": 3.757677587709648e-06, + "loss": 0.63543069, + "num_input_tokens_seen": 32384835, + "step": 1527, + "time_per_iteration": 3.3859424591064453 + }, + { + "auxiliary_loss_clip": 0.01134942, + "auxiliary_loss_mlp": 0.01090763, + "balance_loss_clip": 1.03910708, + "balance_loss_mlp": 1.00827014, + "epoch": 0.183731136896531, + "flos": 25738721971200.0, + "grad_norm": 1.932011933607104, + "language_loss": 0.75645697, + "learning_rate": 3.7573057923618095e-06, + "loss": 0.77871406, + "num_input_tokens_seen": 32404930, + "step": 1528, + "time_per_iteration": 2.8511886596679688 + }, + { + "auxiliary_loss_clip": 0.01131805, + "auxiliary_loss_mlp": 0.01090502, + "balance_loss_clip": 1.03789902, + "balance_loss_mlp": 1.00791407, + "epoch": 0.1838513797871701, + "flos": 20449403712000.0, + "grad_norm": 3.8566552584735665, + "language_loss": 0.74297249, + "learning_rate": 3.7569337304320793e-06, + "loss": 0.76519561, + "num_input_tokens_seen": 32424515, + "step": 1529, + "time_per_iteration": 2.7820286750793457 + }, + { + "auxiliary_loss_clip": 0.01146196, + "auxiliary_loss_mlp": 0.01080577, + "balance_loss_clip": 1.04581499, + "balance_loss_mlp": 1.00046813, + "epoch": 0.18397162267780917, + "flos": 68565141786240.0, + "grad_norm": 0.839215631901956, + "language_loss": 0.64543265, + "learning_rate": 3.756561401976899e-06, + "loss": 0.66770041, + "num_input_tokens_seen": 32484220, + "step": 1530, + "time_per_iteration": 3.1816205978393555 + }, + { + "auxiliary_loss_clip": 0.0117597, + "auxiliary_loss_mlp": 0.01090613, + "balance_loss_clip": 1.04665172, + "balance_loss_mlp": 1.00807285, + "epoch": 0.18409186556844825, + "flos": 31941104976000.0, + "grad_norm": 1.742567487712328, + "language_loss": 0.82709372, + "learning_rate": 3.7561888070527514e-06, + "loss": 0.84975958, + "num_input_tokens_seen": 32506260, + "step": 1531, + "time_per_iteration": 2.8176708221435547 + }, + { + "auxiliary_loss_clip": 0.01138797, + "auxiliary_loss_mlp": 0.00874445, + "balance_loss_clip": 1.03991914, + "balance_loss_mlp": 1.00026572, + "epoch": 0.18421210845908736, + "flos": 20120533764480.0, + "grad_norm": 2.261510113394184, + "language_loss": 0.79884374, + "learning_rate": 3.7558159457161577e-06, + "loss": 0.81897616, + "num_input_tokens_seen": 32524225, + "step": 1532, + "time_per_iteration": 2.7879600524902344 + }, + { + "auxiliary_loss_clip": 0.01151933, + "auxiliary_loss_mlp": 0.00874366, + "balance_loss_clip": 1.0411911, + "balance_loss_mlp": 1.00024521, + "epoch": 0.18433235134972645, + "flos": 23110491824640.0, + "grad_norm": 3.140343805677135, + "language_loss": 0.77511406, + "learning_rate": 3.755442818023681e-06, + "loss": 0.79537702, + "num_input_tokens_seen": 32543850, + "step": 1533, + "time_per_iteration": 2.7648701667785645 + }, + { + "auxiliary_loss_clip": 0.01142025, + "auxiliary_loss_mlp": 0.01090139, + "balance_loss_clip": 1.04062748, + "balance_loss_mlp": 1.00778961, + "epoch": 0.18445259424036553, + "flos": 18291351617280.0, + "grad_norm": 1.8848479884458154, + "language_loss": 0.75839734, + "learning_rate": 3.7550694240319246e-06, + "loss": 0.78071892, + "num_input_tokens_seen": 32561725, + "step": 1534, + "time_per_iteration": 2.7570300102233887 + }, + { + "auxiliary_loss_clip": 0.01161852, + "auxiliary_loss_mlp": 0.0108798, + "balance_loss_clip": 1.04076648, + "balance_loss_mlp": 1.00567818, + "epoch": 0.18457283713100464, + "flos": 21324079797120.0, + "grad_norm": 2.3322066919400677, + "language_loss": 0.76601374, + "learning_rate": 3.7546957637975326e-06, + "loss": 0.78851205, + "num_input_tokens_seen": 32579135, + "step": 1535, + "time_per_iteration": 2.7434773445129395 + }, + { + "auxiliary_loss_clip": 0.01122928, + "auxiliary_loss_mlp": 0.01091424, + "balance_loss_clip": 1.03711152, + "balance_loss_mlp": 1.00912154, + "epoch": 0.18469308002164372, + "flos": 20375679047040.0, + "grad_norm": 1.4660964751117171, + "language_loss": 0.7392168, + "learning_rate": 3.7543218373771873e-06, + "loss": 0.76136035, + "num_input_tokens_seen": 32598460, + "step": 1536, + "time_per_iteration": 2.8453752994537354 + }, + { + "auxiliary_loss_clip": 0.01125951, + "auxiliary_loss_mlp": 0.00874432, + "balance_loss_clip": 1.03930712, + "balance_loss_mlp": 1.00028038, + "epoch": 0.1848133229122828, + "flos": 26435892021120.0, + "grad_norm": 1.3599005516625509, + "language_loss": 0.78176469, + "learning_rate": 3.753947644827615e-06, + "loss": 0.80176842, + "num_input_tokens_seen": 32621920, + "step": 1537, + "time_per_iteration": 2.949774742126465 + }, + { + "auxiliary_loss_clip": 0.0114542, + "auxiliary_loss_mlp": 0.01080301, + "balance_loss_clip": 1.0450592, + "balance_loss_mlp": 1.00019288, + "epoch": 0.1849335658029219, + "flos": 70547447612160.0, + "grad_norm": 0.9648768538274117, + "language_loss": 0.57225406, + "learning_rate": 3.753573186205579e-06, + "loss": 0.59451139, + "num_input_tokens_seen": 32690040, + "step": 1538, + "time_per_iteration": 3.4175405502319336 + }, + { + "auxiliary_loss_clip": 0.01150742, + "auxiliary_loss_mlp": 0.00874381, + "balance_loss_clip": 1.04019487, + "balance_loss_mlp": 1.00022697, + "epoch": 0.185053808693561, + "flos": 17384140788480.0, + "grad_norm": 2.1273386805109675, + "language_loss": 0.77710098, + "learning_rate": 3.753198461567885e-06, + "loss": 0.79735225, + "num_input_tokens_seen": 32707285, + "step": 1539, + "time_per_iteration": 3.6418814659118652 + }, + { + "auxiliary_loss_clip": 0.01135571, + "auxiliary_loss_mlp": 0.01089663, + "balance_loss_clip": 1.03762543, + "balance_loss_mlp": 1.00759983, + "epoch": 0.18517405158420008, + "flos": 28986159697920.0, + "grad_norm": 2.1428743705400173, + "language_loss": 0.91951054, + "learning_rate": 3.7528234709713783e-06, + "loss": 0.94176292, + "num_input_tokens_seen": 32730030, + "step": 1540, + "time_per_iteration": 2.8550658226013184 + }, + { + "auxiliary_loss_clip": 0.01162199, + "auxiliary_loss_mlp": 0.01089207, + "balance_loss_clip": 1.04187095, + "balance_loss_mlp": 1.00695276, + "epoch": 0.18529429447483917, + "flos": 26794962328320.0, + "grad_norm": 1.7973681251969784, + "language_loss": 0.8444798, + "learning_rate": 3.7524482144729447e-06, + "loss": 0.8669939, + "num_input_tokens_seen": 32749485, + "step": 1541, + "time_per_iteration": 3.6820285320281982 + }, + { + "auxiliary_loss_clip": 0.01146238, + "auxiliary_loss_mlp": 0.01089314, + "balance_loss_clip": 1.04245996, + "balance_loss_mlp": 1.00705981, + "epoch": 0.18541453736547828, + "flos": 13581595301760.0, + "grad_norm": 2.4405948156090465, + "language_loss": 0.83791244, + "learning_rate": 3.7520726921295106e-06, + "loss": 0.86026788, + "num_input_tokens_seen": 32766205, + "step": 1542, + "time_per_iteration": 2.8707025051116943 + }, + { + "auxiliary_loss_clip": 0.01163023, + "auxiliary_loss_mlp": 0.01091108, + "balance_loss_clip": 1.0419035, + "balance_loss_mlp": 1.00866318, + "epoch": 0.18553478025611736, + "flos": 24025424077440.0, + "grad_norm": 1.7056079140133886, + "language_loss": 0.72242862, + "learning_rate": 3.751696903998042e-06, + "loss": 0.7449699, + "num_input_tokens_seen": 32784840, + "step": 1543, + "time_per_iteration": 2.7719531059265137 + }, + { + "auxiliary_loss_clip": 0.01154363, + "auxiliary_loss_mlp": 0.01089741, + "balance_loss_clip": 1.04133821, + "balance_loss_mlp": 1.00758231, + "epoch": 0.18565502314675644, + "flos": 25885165720320.0, + "grad_norm": 1.5329107860192839, + "language_loss": 0.70056969, + "learning_rate": 3.7513208501355456e-06, + "loss": 0.72301072, + "num_input_tokens_seen": 32805945, + "step": 1544, + "time_per_iteration": 3.686382293701172 + }, + { + "auxiliary_loss_clip": 0.01149813, + "auxiliary_loss_mlp": 0.01091677, + "balance_loss_clip": 1.04238081, + "balance_loss_mlp": 1.00951827, + "epoch": 0.18577526603739553, + "flos": 19610063631360.0, + "grad_norm": 1.8767445798240467, + "language_loss": 0.83733606, + "learning_rate": 3.750944530599069e-06, + "loss": 0.85975099, + "num_input_tokens_seen": 32825515, + "step": 1545, + "time_per_iteration": 2.7394440174102783 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01089163, + "balance_loss_clip": 1.04340625, + "balance_loss_mlp": 1.00681293, + "epoch": 0.18589550892803464, + "flos": 18474891137280.0, + "grad_norm": 1.8644360498788235, + "language_loss": 0.80610085, + "learning_rate": 3.7505679454456992e-06, + "loss": 0.82863408, + "num_input_tokens_seen": 32842125, + "step": 1546, + "time_per_iteration": 2.7210943698883057 + }, + { + "auxiliary_loss_clip": 0.01102163, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_clip": 1.03455806, + "balance_loss_mlp": 1.0081594, + "epoch": 0.18601575181867372, + "flos": 23549966726400.0, + "grad_norm": 1.855812325541268, + "language_loss": 0.70223743, + "learning_rate": 3.750191094732564e-06, + "loss": 0.72416365, + "num_input_tokens_seen": 32862990, + "step": 1547, + "time_per_iteration": 3.8682806491851807 + }, + { + "auxiliary_loss_clip": 0.01107971, + "auxiliary_loss_mlp": 0.00874428, + "balance_loss_clip": 1.03569388, + "balance_loss_mlp": 1.00025582, + "epoch": 0.1861359947093128, + "flos": 26360192108160.0, + "grad_norm": 1.9097234632768676, + "language_loss": 0.75639969, + "learning_rate": 3.7498139785168313e-06, + "loss": 0.77622366, + "num_input_tokens_seen": 32883595, + "step": 1548, + "time_per_iteration": 2.9600322246551514 + }, + { + "auxiliary_loss_clip": 0.01159657, + "auxiliary_loss_mlp": 0.0108881, + "balance_loss_clip": 1.04116952, + "balance_loss_mlp": 1.00650811, + "epoch": 0.1862562375999519, + "flos": 23331198942720.0, + "grad_norm": 1.6892854696819504, + "language_loss": 0.77435184, + "learning_rate": 3.749436596855709e-06, + "loss": 0.7968365, + "num_input_tokens_seen": 32902895, + "step": 1549, + "time_per_iteration": 2.7239484786987305 + }, + { + "auxiliary_loss_clip": 0.01162394, + "auxiliary_loss_mlp": 0.01089642, + "balance_loss_clip": 1.04216361, + "balance_loss_mlp": 1.00733995, + "epoch": 0.186376480490591, + "flos": 16648222942080.0, + "grad_norm": 1.7681441321327132, + "language_loss": 0.90372688, + "learning_rate": 3.749058949806446e-06, + "loss": 0.92624724, + "num_input_tokens_seen": 32919620, + "step": 1550, + "time_per_iteration": 2.6766982078552246 + }, + { + "auxiliary_loss_clip": 0.01160062, + "auxiliary_loss_mlp": 0.0108862, + "balance_loss_clip": 1.04022551, + "balance_loss_mlp": 1.00631785, + "epoch": 0.18649672338123008, + "flos": 21468656039040.0, + "grad_norm": 1.6637548501779325, + "language_loss": 0.84217691, + "learning_rate": 3.748681037426331e-06, + "loss": 0.86466372, + "num_input_tokens_seen": 32938830, + "step": 1551, + "time_per_iteration": 2.7550759315490723 + }, + { + "auxiliary_loss_clip": 0.01172323, + "auxiliary_loss_mlp": 0.0109116, + "balance_loss_clip": 1.04351842, + "balance_loss_mlp": 1.00885785, + "epoch": 0.1866169662718692, + "flos": 12312728386560.0, + "grad_norm": 2.1210993373616835, + "language_loss": 0.9162972, + "learning_rate": 3.7483028597726936e-06, + "loss": 0.93893206, + "num_input_tokens_seen": 32955600, + "step": 1552, + "time_per_iteration": 2.6143720149993896 + }, + { + "auxiliary_loss_clip": 0.01134955, + "auxiliary_loss_mlp": 0.01090062, + "balance_loss_clip": 1.0403583, + "balance_loss_mlp": 1.00752103, + "epoch": 0.18673720916250827, + "flos": 23581280407680.0, + "grad_norm": 2.3709803242699525, + "language_loss": 0.62812603, + "learning_rate": 3.7479244169029017e-06, + "loss": 0.6503762, + "num_input_tokens_seen": 32975390, + "step": 1553, + "time_per_iteration": 2.904996395111084 + }, + { + "auxiliary_loss_clip": 0.01162428, + "auxiliary_loss_mlp": 0.01089633, + "balance_loss_clip": 1.04190361, + "balance_loss_mlp": 1.00718784, + "epoch": 0.18685745205314735, + "flos": 19718370115200.0, + "grad_norm": 2.2030885737203385, + "language_loss": 0.73434401, + "learning_rate": 3.7475457088743658e-06, + "loss": 0.75686467, + "num_input_tokens_seen": 32992640, + "step": 1554, + "time_per_iteration": 2.6629347801208496 + }, + { + "auxiliary_loss_clip": 0.01151601, + "auxiliary_loss_mlp": 0.01089213, + "balance_loss_clip": 1.04136837, + "balance_loss_mlp": 1.00672054, + "epoch": 0.18697769494378644, + "flos": 34204123589760.0, + "grad_norm": 1.8592710759440905, + "language_loss": 0.74795592, + "learning_rate": 3.7471667357445348e-06, + "loss": 0.77036405, + "num_input_tokens_seen": 33012470, + "step": 1555, + "time_per_iteration": 2.8290929794311523 + }, + { + "auxiliary_loss_clip": 0.01110622, + "auxiliary_loss_mlp": 0.01088844, + "balance_loss_clip": 1.0305047, + "balance_loss_mlp": 1.00649476, + "epoch": 0.18709793783442555, + "flos": 34241327101440.0, + "grad_norm": 1.808049388417665, + "language_loss": 0.72448778, + "learning_rate": 3.7467874975709e-06, + "loss": 0.74648249, + "num_input_tokens_seen": 33033275, + "step": 1556, + "time_per_iteration": 2.915961742401123 + }, + { + "auxiliary_loss_clip": 0.01163008, + "auxiliary_loss_mlp": 0.01090475, + "balance_loss_clip": 1.04366839, + "balance_loss_mlp": 1.00817323, + "epoch": 0.18721818072506463, + "flos": 40734550529280.0, + "grad_norm": 1.9253973608990942, + "language_loss": 0.7847262, + "learning_rate": 3.7464079944109904e-06, + "loss": 0.80726105, + "num_input_tokens_seen": 33055135, + "step": 1557, + "time_per_iteration": 2.8465051651000977 + }, + { + "auxiliary_loss_clip": 0.01134442, + "auxiliary_loss_mlp": 0.0108995, + "balance_loss_clip": 1.03564119, + "balance_loss_mlp": 1.00779057, + "epoch": 0.18733842361570371, + "flos": 22157386392960.0, + "grad_norm": 1.9444322629234163, + "language_loss": 0.77592993, + "learning_rate": 3.746028226322376e-06, + "loss": 0.7981739, + "num_input_tokens_seen": 33071015, + "step": 1558, + "time_per_iteration": 2.8083999156951904 + }, + { + "auxiliary_loss_clip": 0.01151937, + "auxiliary_loss_mlp": 0.0109128, + "balance_loss_clip": 1.04167199, + "balance_loss_mlp": 1.00902593, + "epoch": 0.18745866650634282, + "flos": 18914940656640.0, + "grad_norm": 2.281732666574077, + "language_loss": 0.75222492, + "learning_rate": 3.745648193362669e-06, + "loss": 0.77465713, + "num_input_tokens_seen": 33090370, + "step": 1559, + "time_per_iteration": 2.762437105178833 + }, + { + "auxiliary_loss_clip": 0.0115377, + "auxiliary_loss_mlp": 0.0108877, + "balance_loss_clip": 1.04361212, + "balance_loss_mlp": 1.00675404, + "epoch": 0.1875789093969819, + "flos": 19314626267520.0, + "grad_norm": 2.1306309689068685, + "language_loss": 0.72612274, + "learning_rate": 3.745267895589518e-06, + "loss": 0.74854809, + "num_input_tokens_seen": 33108910, + "step": 1560, + "time_per_iteration": 2.7655434608459473 + }, + { + "auxiliary_loss_clip": 0.01148637, + "auxiliary_loss_mlp": 0.01089336, + "balance_loss_clip": 1.03940451, + "balance_loss_mlp": 1.00708175, + "epoch": 0.187699152287621, + "flos": 17018965169280.0, + "grad_norm": 2.080310607275725, + "language_loss": 0.82354707, + "learning_rate": 3.7448873330606154e-06, + "loss": 0.84592682, + "num_input_tokens_seen": 33126680, + "step": 1561, + "time_per_iteration": 2.6808998584747314 + }, + { + "auxiliary_loss_clip": 0.01140315, + "auxiliary_loss_mlp": 0.01091064, + "balance_loss_clip": 1.0397172, + "balance_loss_mlp": 1.0088098, + "epoch": 0.18781939517826007, + "flos": 22346384780160.0, + "grad_norm": 2.2245422954518626, + "language_loss": 0.87499869, + "learning_rate": 3.7445065058336914e-06, + "loss": 0.89731252, + "num_input_tokens_seen": 33145550, + "step": 1562, + "time_per_iteration": 2.8418972492218018 + }, + { + "auxiliary_loss_clip": 0.01125739, + "auxiliary_loss_mlp": 0.01092998, + "balance_loss_clip": 1.03364372, + "balance_loss_mlp": 1.01069593, + "epoch": 0.18793963806889918, + "flos": 14611478054400.0, + "grad_norm": 1.8325967201407423, + "language_loss": 0.86506689, + "learning_rate": 3.7441254139665176e-06, + "loss": 0.88725424, + "num_input_tokens_seen": 33161735, + "step": 1563, + "time_per_iteration": 2.8245790004730225 + }, + { + "auxiliary_loss_clip": 0.01172824, + "auxiliary_loss_mlp": 0.01090489, + "balance_loss_clip": 1.04490876, + "balance_loss_mlp": 1.00837731, + "epoch": 0.18805988095953827, + "flos": 17457075354240.0, + "grad_norm": 1.9518081142436776, + "language_loss": 0.8233214, + "learning_rate": 3.743744057516905e-06, + "loss": 0.84595454, + "num_input_tokens_seen": 33179795, + "step": 1564, + "time_per_iteration": 3.5391299724578857 + }, + { + "auxiliary_loss_clip": 0.01129469, + "auxiliary_loss_mlp": 0.01089963, + "balance_loss_clip": 1.03764462, + "balance_loss_mlp": 1.00746989, + "epoch": 0.18818012385017735, + "flos": 15043877976960.0, + "grad_norm": 2.9019554396701635, + "language_loss": 0.87878257, + "learning_rate": 3.743362436542706e-06, + "loss": 0.9009769, + "num_input_tokens_seen": 33194485, + "step": 1565, + "time_per_iteration": 2.856875419616699 + }, + { + "auxiliary_loss_clip": 0.01169139, + "auxiliary_loss_mlp": 0.01090823, + "balance_loss_clip": 1.04147196, + "balance_loss_mlp": 1.0085206, + "epoch": 0.18830036674081646, + "flos": 47551975136640.0, + "grad_norm": 1.8377894790678533, + "language_loss": 0.76945269, + "learning_rate": 3.7429805511018115e-06, + "loss": 0.79205227, + "num_input_tokens_seen": 33216145, + "step": 1566, + "time_per_iteration": 2.8519954681396484 + }, + { + "auxiliary_loss_clip": 0.01136635, + "auxiliary_loss_mlp": 0.0087455, + "balance_loss_clip": 1.03713536, + "balance_loss_mlp": 1.00024772, + "epoch": 0.18842060963145554, + "flos": 30044626698240.0, + "grad_norm": 3.7923212749894657, + "language_loss": 0.77713561, + "learning_rate": 3.7425984012521524e-06, + "loss": 0.79724747, + "num_input_tokens_seen": 33236345, + "step": 1567, + "time_per_iteration": 3.792442798614502 + }, + { + "auxiliary_loss_clip": 0.0113429, + "auxiliary_loss_mlp": 0.00873815, + "balance_loss_clip": 1.04344487, + "balance_loss_mlp": 1.00018394, + "epoch": 0.18854085252209463, + "flos": 70318372625280.0, + "grad_norm": 0.7399442708126558, + "language_loss": 0.60444212, + "learning_rate": 3.7422159870517025e-06, + "loss": 0.62452316, + "num_input_tokens_seen": 33301600, + "step": 1568, + "time_per_iteration": 3.3631138801574707 + }, + { + "auxiliary_loss_clip": 0.01153428, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_clip": 1.04244161, + "balance_loss_mlp": 1.00774479, + "epoch": 0.1886610954127337, + "flos": 21289318410240.0, + "grad_norm": 1.7365438628566319, + "language_loss": 0.78950912, + "learning_rate": 3.7418333085584717e-06, + "loss": 0.81194198, + "num_input_tokens_seen": 33322785, + "step": 1569, + "time_per_iteration": 3.761047840118408 + }, + { + "auxiliary_loss_clip": 0.0113778, + "auxiliary_loss_mlp": 0.01090932, + "balance_loss_clip": 1.04030323, + "balance_loss_mlp": 1.0086298, + "epoch": 0.18878133830337282, + "flos": 17266819991040.0, + "grad_norm": 3.7137923247382147, + "language_loss": 0.90799677, + "learning_rate": 3.7414503658305128e-06, + "loss": 0.9302839, + "num_input_tokens_seen": 33340020, + "step": 1570, + "time_per_iteration": 2.7712557315826416 + }, + { + "auxiliary_loss_clip": 0.01130779, + "auxiliary_loss_mlp": 0.0109001, + "balance_loss_clip": 1.03734231, + "balance_loss_mlp": 1.00766039, + "epoch": 0.1889015811940119, + "flos": 25775207210880.0, + "grad_norm": 2.545101483123202, + "language_loss": 0.77938569, + "learning_rate": 3.7410671589259185e-06, + "loss": 0.80159354, + "num_input_tokens_seen": 33358620, + "step": 1571, + "time_per_iteration": 2.8326010704040527 + }, + { + "auxiliary_loss_clip": 0.01170874, + "auxiliary_loss_mlp": 0.01091069, + "balance_loss_clip": 1.0432806, + "balance_loss_mlp": 1.00876689, + "epoch": 0.18902182408465099, + "flos": 21032197879680.0, + "grad_norm": 2.0424234552619844, + "language_loss": 0.79859227, + "learning_rate": 3.7406836879028205e-06, + "loss": 0.8212117, + "num_input_tokens_seen": 33378845, + "step": 1572, + "time_per_iteration": 3.6683034896850586 + }, + { + "auxiliary_loss_clip": 0.01159201, + "auxiliary_loss_mlp": 0.01090375, + "balance_loss_clip": 1.04137921, + "balance_loss_mlp": 1.00816846, + "epoch": 0.1891420669752901, + "flos": 22272121411200.0, + "grad_norm": 3.31743358323745, + "language_loss": 0.76693857, + "learning_rate": 3.7402999528193907e-06, + "loss": 0.78943431, + "num_input_tokens_seen": 33398345, + "step": 1573, + "time_per_iteration": 2.7727267742156982 + }, + { + "auxiliary_loss_clip": 0.01133618, + "auxiliary_loss_mlp": 0.00874464, + "balance_loss_clip": 1.0361675, + "balance_loss_mlp": 1.00021195, + "epoch": 0.18926230986592918, + "flos": 22017802141440.0, + "grad_norm": 2.3479425551532134, + "language_loss": 0.85436177, + "learning_rate": 3.739915953733842e-06, + "loss": 0.87444258, + "num_input_tokens_seen": 33416390, + "step": 1574, + "time_per_iteration": 2.82126784324646 + }, + { + "auxiliary_loss_clip": 0.01169392, + "auxiliary_loss_mlp": 0.01090318, + "balance_loss_clip": 1.04188728, + "balance_loss_mlp": 1.00801563, + "epoch": 0.18938255275656826, + "flos": 24462672336000.0, + "grad_norm": 1.715107532925818, + "language_loss": 0.81927359, + "learning_rate": 3.7395316907044264e-06, + "loss": 0.84187067, + "num_input_tokens_seen": 33437175, + "step": 1575, + "time_per_iteration": 2.7178561687469482 + }, + { + "auxiliary_loss_clip": 0.01155575, + "auxiliary_loss_mlp": 0.01090388, + "balance_loss_clip": 1.03980696, + "balance_loss_mlp": 1.00789559, + "epoch": 0.18950279564720737, + "flos": 24427049022720.0, + "grad_norm": 1.5130610744949968, + "language_loss": 0.79434443, + "learning_rate": 3.7391471637894364e-06, + "loss": 0.81680399, + "num_input_tokens_seen": 33459440, + "step": 1576, + "time_per_iteration": 2.7428133487701416 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.01090353, + "balance_loss_clip": 1.03823519, + "balance_loss_mlp": 1.00795555, + "epoch": 0.18962303853784646, + "flos": 19756291898880.0, + "grad_norm": 1.8791274925698973, + "language_loss": 0.84516114, + "learning_rate": 3.738762373047205e-06, + "loss": 0.86746132, + "num_input_tokens_seen": 33479360, + "step": 1577, + "time_per_iteration": 2.8148951530456543 + }, + { + "auxiliary_loss_clip": 0.0114082, + "auxiliary_loss_mlp": 0.01089274, + "balance_loss_clip": 1.03969204, + "balance_loss_mlp": 1.00687647, + "epoch": 0.18974328142848554, + "flos": 21032054225280.0, + "grad_norm": 1.6371537459078747, + "language_loss": 0.83318067, + "learning_rate": 3.738377318536103e-06, + "loss": 0.85548162, + "num_input_tokens_seen": 33499245, + "step": 1578, + "time_per_iteration": 2.7777750492095947 + }, + { + "auxiliary_loss_clip": 0.0117083, + "auxiliary_loss_mlp": 0.01089122, + "balance_loss_clip": 1.04387736, + "balance_loss_mlp": 1.00701046, + "epoch": 0.18986352431912462, + "flos": 12966122736000.0, + "grad_norm": 2.1290345141834894, + "language_loss": 0.71457916, + "learning_rate": 3.7379920003145447e-06, + "loss": 0.73717868, + "num_input_tokens_seen": 33513520, + "step": 1579, + "time_per_iteration": 2.6367852687835693 + }, + { + "auxiliary_loss_clip": 0.01149076, + "auxiliary_loss_mlp": 0.01091607, + "balance_loss_clip": 1.04088485, + "balance_loss_mlp": 1.00911379, + "epoch": 0.18998376720976373, + "flos": 23767908497280.0, + "grad_norm": 1.7133493690244879, + "language_loss": 0.83865082, + "learning_rate": 3.7376064184409817e-06, + "loss": 0.86105764, + "num_input_tokens_seen": 33533100, + "step": 1580, + "time_per_iteration": 2.7585864067077637 + }, + { + "auxiliary_loss_clip": 0.01151696, + "auxiliary_loss_mlp": 0.01088762, + "balance_loss_clip": 1.04237556, + "balance_loss_mlp": 1.00660276, + "epoch": 0.19010401010040281, + "flos": 22966023323520.0, + "grad_norm": 1.3830679712995844, + "language_loss": 0.87215018, + "learning_rate": 3.7372205729739063e-06, + "loss": 0.89455473, + "num_input_tokens_seen": 33554915, + "step": 1581, + "time_per_iteration": 2.688687801361084 + }, + { + "auxiliary_loss_clip": 0.0116327, + "auxiliary_loss_mlp": 0.01090633, + "balance_loss_clip": 1.04423296, + "balance_loss_mlp": 1.00833142, + "epoch": 0.1902242529910419, + "flos": 19135647774720.0, + "grad_norm": 2.815557286951332, + "language_loss": 0.71819532, + "learning_rate": 3.7368344639718514e-06, + "loss": 0.74073434, + "num_input_tokens_seen": 33572850, + "step": 1582, + "time_per_iteration": 2.7931065559387207 + }, + { + "auxiliary_loss_clip": 0.01161217, + "auxiliary_loss_mlp": 0.01092968, + "balance_loss_clip": 1.04211664, + "balance_loss_mlp": 1.0107137, + "epoch": 0.190344495881681, + "flos": 25483935824640.0, + "grad_norm": 1.5117458434321918, + "language_loss": 0.80416542, + "learning_rate": 3.7364480914933895e-06, + "loss": 0.82670724, + "num_input_tokens_seen": 33593090, + "step": 1583, + "time_per_iteration": 2.6829848289489746 + }, + { + "auxiliary_loss_clip": 0.01124735, + "auxiliary_loss_mlp": 0.00874393, + "balance_loss_clip": 1.03541803, + "balance_loss_mlp": 1.00018549, + "epoch": 0.1904647387723201, + "flos": 26792843425920.0, + "grad_norm": 1.8046889964194848, + "language_loss": 0.8137207, + "learning_rate": 3.7360614555971325e-06, + "loss": 0.83371198, + "num_input_tokens_seen": 33612745, + "step": 1584, + "time_per_iteration": 2.86482572555542 + }, + { + "auxiliary_loss_clip": 0.01159202, + "auxiliary_loss_mlp": 0.00874383, + "balance_loss_clip": 1.04205704, + "balance_loss_mlp": 1.00014472, + "epoch": 0.19058498166295917, + "flos": 23987753688960.0, + "grad_norm": 2.44628125903833, + "language_loss": 0.85149956, + "learning_rate": 3.735674556341733e-06, + "loss": 0.87183535, + "num_input_tokens_seen": 33632360, + "step": 1585, + "time_per_iteration": 2.688518762588501 + }, + { + "auxiliary_loss_clip": 0.01143733, + "auxiliary_loss_mlp": 0.0108875, + "balance_loss_clip": 1.03932118, + "balance_loss_mlp": 1.00649524, + "epoch": 0.19070522455359826, + "flos": 28293299280000.0, + "grad_norm": 2.25070720248142, + "language_loss": 0.82554501, + "learning_rate": 3.7352873937858835e-06, + "loss": 0.84786987, + "num_input_tokens_seen": 33653895, + "step": 1586, + "time_per_iteration": 2.7834908962249756 + }, + { + "auxiliary_loss_clip": 0.01132087, + "auxiliary_loss_mlp": 0.00874469, + "balance_loss_clip": 1.03426445, + "balance_loss_mlp": 1.00016427, + "epoch": 0.19082546744423737, + "flos": 25660220797440.0, + "grad_norm": 2.2080214999767707, + "language_loss": 0.72068167, + "learning_rate": 3.734899967988316e-06, + "loss": 0.74074727, + "num_input_tokens_seen": 33672075, + "step": 1587, + "time_per_iteration": 2.843848705291748 + }, + { + "auxiliary_loss_clip": 0.01136815, + "auxiliary_loss_mlp": 0.01089091, + "balance_loss_clip": 1.04103351, + "balance_loss_mlp": 1.00707459, + "epoch": 0.19094571033487645, + "flos": 19719483436800.0, + "grad_norm": 1.715046196913916, + "language_loss": 0.83921587, + "learning_rate": 3.7345122790078026e-06, + "loss": 0.86147487, + "num_input_tokens_seen": 33689640, + "step": 1588, + "time_per_iteration": 2.8088083267211914 + }, + { + "auxiliary_loss_clip": 0.01156479, + "auxiliary_loss_mlp": 0.01088667, + "balance_loss_clip": 1.03958845, + "balance_loss_mlp": 1.00641298, + "epoch": 0.19106595322551553, + "flos": 21616320850560.0, + "grad_norm": 2.8135438333929144, + "language_loss": 0.92661732, + "learning_rate": 3.7341243269031556e-06, + "loss": 0.94906878, + "num_input_tokens_seen": 33708630, + "step": 1589, + "time_per_iteration": 3.6880877017974854 + }, + { + "auxiliary_loss_clip": 0.01150678, + "auxiliary_loss_mlp": 0.01089826, + "balance_loss_clip": 1.04243815, + "balance_loss_mlp": 1.00761938, + "epoch": 0.19118619611615464, + "flos": 29896890059520.0, + "grad_norm": 1.612672167354502, + "language_loss": 0.77675623, + "learning_rate": 3.7337361117332275e-06, + "loss": 0.79916126, + "num_input_tokens_seen": 33730370, + "step": 1590, + "time_per_iteration": 2.7950174808502197 + }, + { + "auxiliary_loss_clip": 0.01140808, + "auxiliary_loss_mlp": 0.01088836, + "balance_loss_clip": 1.03944004, + "balance_loss_mlp": 1.00648642, + "epoch": 0.19130643900679373, + "flos": 17273428093440.0, + "grad_norm": 1.8343013691420011, + "language_loss": 0.76955128, + "learning_rate": 3.7333476335569087e-06, + "loss": 0.79184771, + "num_input_tokens_seen": 33748370, + "step": 1591, + "time_per_iteration": 2.801044225692749 + }, + { + "auxiliary_loss_clip": 0.01149661, + "auxiliary_loss_mlp": 0.0109025, + "balance_loss_clip": 1.04138339, + "balance_loss_mlp": 1.00785232, + "epoch": 0.1914266818974328, + "flos": 24826339584000.0, + "grad_norm": 2.872013871255762, + "language_loss": 0.67099488, + "learning_rate": 3.7329588924331325e-06, + "loss": 0.69339401, + "num_input_tokens_seen": 33769575, + "step": 1592, + "time_per_iteration": 2.8073830604553223 + }, + { + "auxiliary_loss_clip": 0.01140303, + "auxiliary_loss_mlp": 0.0108804, + "balance_loss_clip": 1.03994417, + "balance_loss_mlp": 1.00583315, + "epoch": 0.1915469247880719, + "flos": 18952467390720.0, + "grad_norm": 1.907248463211236, + "language_loss": 0.82289284, + "learning_rate": 3.732569888420871e-06, + "loss": 0.84517628, + "num_input_tokens_seen": 33789110, + "step": 1593, + "time_per_iteration": 3.735687017440796 + }, + { + "auxiliary_loss_clip": 0.01167252, + "auxiliary_loss_mlp": 0.01091268, + "balance_loss_clip": 1.04020774, + "balance_loss_mlp": 1.00872755, + "epoch": 0.191667167678711, + "flos": 21032952065280.0, + "grad_norm": 2.0844697295187316, + "language_loss": 0.82361716, + "learning_rate": 3.732180621579134e-06, + "loss": 0.84620237, + "num_input_tokens_seen": 33808325, + "step": 1594, + "time_per_iteration": 2.6717443466186523 + }, + { + "auxiliary_loss_clip": 0.01144532, + "auxiliary_loss_mlp": 0.01093053, + "balance_loss_clip": 1.04234433, + "balance_loss_mlp": 1.01060772, + "epoch": 0.1917874105693501, + "flos": 34237663914240.0, + "grad_norm": 1.93775865915951, + "language_loss": 0.81240487, + "learning_rate": 3.7317910919669745e-06, + "loss": 0.83478075, + "num_input_tokens_seen": 33829520, + "step": 1595, + "time_per_iteration": 3.9041519165039062 + }, + { + "auxiliary_loss_clip": 0.01156862, + "auxiliary_loss_mlp": 0.0108881, + "balance_loss_clip": 1.03991854, + "balance_loss_mlp": 1.00660288, + "epoch": 0.19190765345998917, + "flos": 23550613171200.0, + "grad_norm": 2.292846844960265, + "language_loss": 0.7577275, + "learning_rate": 3.7314012996434826e-06, + "loss": 0.78018421, + "num_input_tokens_seen": 33848250, + "step": 1596, + "time_per_iteration": 2.761258602142334 + }, + { + "auxiliary_loss_clip": 0.01144461, + "auxiliary_loss_mlp": 0.01088644, + "balance_loss_clip": 1.03846467, + "balance_loss_mlp": 1.00643682, + "epoch": 0.19202789635062828, + "flos": 19861330245120.0, + "grad_norm": 1.8998326430221197, + "language_loss": 0.80986941, + "learning_rate": 3.7310112446677907e-06, + "loss": 0.83220041, + "num_input_tokens_seen": 33866160, + "step": 1597, + "time_per_iteration": 3.6743454933166504 + }, + { + "auxiliary_loss_clip": 0.01171132, + "auxiliary_loss_mlp": 0.01089492, + "balance_loss_clip": 1.04408884, + "balance_loss_mlp": 1.0070951, + "epoch": 0.19214813924126736, + "flos": 20922957642240.0, + "grad_norm": 1.9837632066511546, + "language_loss": 0.6948157, + "learning_rate": 3.7306209270990695e-06, + "loss": 0.71742201, + "num_input_tokens_seen": 33884165, + "step": 1598, + "time_per_iteration": 2.6616382598876953 + }, + { + "auxiliary_loss_clip": 0.01144482, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_clip": 1.04118562, + "balance_loss_mlp": 1.01038969, + "epoch": 0.19226838213190645, + "flos": 26359725231360.0, + "grad_norm": 1.8786948345075833, + "language_loss": 0.86993396, + "learning_rate": 3.7302303469965292e-06, + "loss": 0.8923043, + "num_input_tokens_seen": 33903705, + "step": 1599, + "time_per_iteration": 2.8110718727111816 + }, + { + "auxiliary_loss_clip": 0.01158596, + "auxiliary_loss_mlp": 0.01090014, + "balance_loss_clip": 1.04157138, + "balance_loss_mlp": 1.00761676, + "epoch": 0.19238862502254553, + "flos": 20850525866880.0, + "grad_norm": 1.9412215054548398, + "language_loss": 0.70694917, + "learning_rate": 3.7298395044194206e-06, + "loss": 0.72943521, + "num_input_tokens_seen": 33922515, + "step": 1600, + "time_per_iteration": 2.721388578414917 + }, + { + "auxiliary_loss_clip": 0.01170525, + "auxiliary_loss_mlp": 0.01089568, + "balance_loss_clip": 1.04368854, + "balance_loss_mlp": 1.00755227, + "epoch": 0.19250886791318464, + "flos": 21726063878400.0, + "grad_norm": 1.90155869679312, + "language_loss": 0.94239426, + "learning_rate": 3.7294483994270356e-06, + "loss": 0.96499515, + "num_input_tokens_seen": 33940840, + "step": 1601, + "time_per_iteration": 2.7025349140167236 + }, + { + "auxiliary_loss_clip": 0.01124724, + "auxiliary_loss_mlp": 0.01088382, + "balance_loss_clip": 1.03704786, + "balance_loss_mlp": 1.00622296, + "epoch": 0.19262911080382372, + "flos": 23367827836800.0, + "grad_norm": 2.1628923649958116, + "language_loss": 0.77987278, + "learning_rate": 3.7290570320787033e-06, + "loss": 0.80200374, + "num_input_tokens_seen": 33960420, + "step": 1602, + "time_per_iteration": 2.9101903438568115 + }, + { + "auxiliary_loss_clip": 0.01151315, + "auxiliary_loss_mlp": 0.01089266, + "balance_loss_clip": 1.03601408, + "balance_loss_mlp": 1.0070591, + "epoch": 0.1927493536944628, + "flos": 21943502858880.0, + "grad_norm": 1.9858157396760512, + "language_loss": 0.7104212, + "learning_rate": 3.728665402433793e-06, + "loss": 0.73282701, + "num_input_tokens_seen": 33978990, + "step": 1603, + "time_per_iteration": 2.7000982761383057 + }, + { + "auxiliary_loss_clip": 0.01142935, + "auxiliary_loss_mlp": 0.01091417, + "balance_loss_clip": 1.03643847, + "balance_loss_mlp": 1.00925803, + "epoch": 0.19286959658510192, + "flos": 16545590807040.0, + "grad_norm": 2.243261979038915, + "language_loss": 0.86265862, + "learning_rate": 3.7282735105517164e-06, + "loss": 0.88500214, + "num_input_tokens_seen": 33997115, + "step": 1604, + "time_per_iteration": 2.758035659790039 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.0108883, + "balance_loss_clip": 1.03755057, + "balance_loss_mlp": 1.00652814, + "epoch": 0.192989839475741, + "flos": 21616967295360.0, + "grad_norm": 2.2129482338081288, + "language_loss": 0.67319953, + "learning_rate": 3.727881356491922e-06, + "loss": 0.69539785, + "num_input_tokens_seen": 34015525, + "step": 1605, + "time_per_iteration": 2.79862380027771 + }, + { + "auxiliary_loss_clip": 0.01170654, + "auxiliary_loss_mlp": 0.01090277, + "balance_loss_clip": 1.0440259, + "balance_loss_mlp": 1.00835657, + "epoch": 0.19311008236638008, + "flos": 19281516906240.0, + "grad_norm": 1.9460693981066683, + "language_loss": 0.7620576, + "learning_rate": 3.7274889403139002e-06, + "loss": 0.78466702, + "num_input_tokens_seen": 34033150, + "step": 1606, + "time_per_iteration": 2.689082145690918 + }, + { + "auxiliary_loss_clip": 0.01124868, + "auxiliary_loss_mlp": 0.01090333, + "balance_loss_clip": 1.0361073, + "balance_loss_mlp": 1.00822127, + "epoch": 0.1932303252570192, + "flos": 28652369587200.0, + "grad_norm": 2.3490298437205395, + "language_loss": 0.7814458, + "learning_rate": 3.727096262077179e-06, + "loss": 0.80359781, + "num_input_tokens_seen": 34052145, + "step": 1607, + "time_per_iteration": 2.9655792713165283 + }, + { + "auxiliary_loss_clip": 0.01158497, + "auxiliary_loss_mlp": 0.01091119, + "balance_loss_clip": 1.04078758, + "balance_loss_mlp": 1.00896037, + "epoch": 0.19335056814765827, + "flos": 18368990864640.0, + "grad_norm": 2.0140978294055634, + "language_loss": 0.85381234, + "learning_rate": 3.7267033218413285e-06, + "loss": 0.87630856, + "num_input_tokens_seen": 34069940, + "step": 1608, + "time_per_iteration": 2.7249512672424316 + }, + { + "auxiliary_loss_clip": 0.01122563, + "auxiliary_loss_mlp": 0.01091445, + "balance_loss_clip": 1.03791285, + "balance_loss_mlp": 1.00895262, + "epoch": 0.19347081103829736, + "flos": 13260877741440.0, + "grad_norm": 1.9660202621631218, + "language_loss": 0.81321943, + "learning_rate": 3.726310119665957e-06, + "loss": 0.83535957, + "num_input_tokens_seen": 34086275, + "step": 1609, + "time_per_iteration": 2.8453214168548584 + }, + { + "auxiliary_loss_clip": 0.01156563, + "auxiliary_loss_mlp": 0.0108918, + "balance_loss_clip": 1.03954673, + "balance_loss_mlp": 1.00683045, + "epoch": 0.19359105392893644, + "flos": 20300122788480.0, + "grad_norm": 1.8401704412097617, + "language_loss": 0.85234261, + "learning_rate": 3.725916655610713e-06, + "loss": 0.87480003, + "num_input_tokens_seen": 34105605, + "step": 1610, + "time_per_iteration": 2.7271811962127686 + }, + { + "auxiliary_loss_clip": 0.01143186, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_clip": 1.03980315, + "balance_loss_mlp": 1.00675988, + "epoch": 0.19371129681957555, + "flos": 20484596062080.0, + "grad_norm": 4.854990983942845, + "language_loss": 0.75516963, + "learning_rate": 3.725522929735284e-06, + "loss": 0.77749217, + "num_input_tokens_seen": 34122540, + "step": 1611, + "time_per_iteration": 2.8003289699554443 + }, + { + "auxiliary_loss_clip": 0.01151637, + "auxiliary_loss_mlp": 0.01087178, + "balance_loss_clip": 1.0405457, + "balance_loss_mlp": 1.00487614, + "epoch": 0.19383153971021463, + "flos": 30445497457920.0, + "grad_norm": 1.9894025626755885, + "language_loss": 0.7411359, + "learning_rate": 3.725128942099399e-06, + "loss": 0.76352406, + "num_input_tokens_seen": 34142940, + "step": 1612, + "time_per_iteration": 2.792269229888916 + }, + { + "auxiliary_loss_clip": 0.01147887, + "auxiliary_loss_mlp": 0.01089124, + "balance_loss_clip": 1.03931952, + "balance_loss_mlp": 1.00696492, + "epoch": 0.19395178260085372, + "flos": 24569937325440.0, + "grad_norm": 1.7939466392338423, + "language_loss": 0.79978311, + "learning_rate": 3.7247346927628245e-06, + "loss": 0.82215321, + "num_input_tokens_seen": 34162875, + "step": 1613, + "time_per_iteration": 3.806833028793335 + }, + { + "auxiliary_loss_clip": 0.01137535, + "auxiliary_loss_mlp": 0.00874302, + "balance_loss_clip": 1.03736138, + "balance_loss_mlp": 1.00008106, + "epoch": 0.19407202549149283, + "flos": 28950608211840.0, + "grad_norm": 2.051058428305877, + "language_loss": 0.7990672, + "learning_rate": 3.7243401817853694e-06, + "loss": 0.81918555, + "num_input_tokens_seen": 34183565, + "step": 1614, + "time_per_iteration": 2.8207054138183594 + }, + { + "auxiliary_loss_clip": 0.01162089, + "auxiliary_loss_mlp": 0.01091481, + "balance_loss_clip": 1.0434444, + "balance_loss_mlp": 1.00936985, + "epoch": 0.1941922683821319, + "flos": 18004497603840.0, + "grad_norm": 1.9482704179357604, + "language_loss": 0.71838808, + "learning_rate": 3.723945409226879e-06, + "loss": 0.74092382, + "num_input_tokens_seen": 34202055, + "step": 1615, + "time_per_iteration": 2.7370476722717285 + }, + { + "auxiliary_loss_clip": 0.01160042, + "auxiliary_loss_mlp": 0.01088621, + "balance_loss_clip": 1.04157877, + "balance_loss_mlp": 1.00650918, + "epoch": 0.194312511272771, + "flos": 9720337034880.0, + "grad_norm": 2.273215962289981, + "language_loss": 0.80138707, + "learning_rate": 3.723550375147241e-06, + "loss": 0.82387364, + "num_input_tokens_seen": 34216830, + "step": 1616, + "time_per_iteration": 2.7234437465667725 + }, + { + "auxiliary_loss_clip": 0.01131873, + "auxiliary_loss_mlp": 0.0109037, + "balance_loss_clip": 1.03822923, + "balance_loss_mlp": 1.00821114, + "epoch": 0.19443275416341008, + "flos": 27016208150400.0, + "grad_norm": 1.8340919609864645, + "language_loss": 0.79926634, + "learning_rate": 3.7231550796063816e-06, + "loss": 0.82148874, + "num_input_tokens_seen": 34236840, + "step": 1617, + "time_per_iteration": 2.8270089626312256 + }, + { + "auxiliary_loss_clip": 0.0114742, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_clip": 1.03918529, + "balance_loss_mlp": 1.01078606, + "epoch": 0.1945529970540492, + "flos": 15846625077120.0, + "grad_norm": 1.7075955631311408, + "language_loss": 0.64864135, + "learning_rate": 3.722759522664266e-06, + "loss": 0.67104739, + "num_input_tokens_seen": 34254140, + "step": 1618, + "time_per_iteration": 3.7149856090545654 + }, + { + "auxiliary_loss_clip": 0.01125407, + "auxiliary_loss_mlp": 0.01090706, + "balance_loss_clip": 1.03693449, + "balance_loss_mlp": 1.00845146, + "epoch": 0.19467323994468827, + "flos": 19314985403520.0, + "grad_norm": 1.7098545018076152, + "language_loss": 0.81623232, + "learning_rate": 3.7223637043809016e-06, + "loss": 0.83839345, + "num_input_tokens_seen": 34273120, + "step": 1619, + "time_per_iteration": 2.752014636993408 + }, + { + "auxiliary_loss_clip": 0.01133629, + "auxiliary_loss_mlp": 0.01089858, + "balance_loss_clip": 1.03958249, + "balance_loss_mlp": 1.00765181, + "epoch": 0.19479348283532735, + "flos": 24133227770880.0, + "grad_norm": 2.2190221563360577, + "language_loss": 0.86311591, + "learning_rate": 3.7219676248163322e-06, + "loss": 0.88535082, + "num_input_tokens_seen": 34290285, + "step": 1620, + "time_per_iteration": 2.8216938972473145 + }, + { + "auxiliary_loss_clip": 0.01158395, + "auxiliary_loss_mlp": 0.01089709, + "balance_loss_clip": 1.04016924, + "balance_loss_mlp": 1.00745511, + "epoch": 0.19491372572596646, + "flos": 25775638174080.0, + "grad_norm": 1.850776213248583, + "language_loss": 0.93271554, + "learning_rate": 3.721571284030643e-06, + "loss": 0.95519656, + "num_input_tokens_seen": 34310095, + "step": 1621, + "time_per_iteration": 3.7233633995056152 + }, + { + "auxiliary_loss_clip": 0.01159302, + "auxiliary_loss_mlp": 0.01090042, + "balance_loss_clip": 1.04123974, + "balance_loss_mlp": 1.0079304, + "epoch": 0.19503396861660555, + "flos": 19645220067840.0, + "grad_norm": 2.312153651260234, + "language_loss": 0.79279786, + "learning_rate": 3.7211746820839587e-06, + "loss": 0.81529135, + "num_input_tokens_seen": 34327190, + "step": 1622, + "time_per_iteration": 2.6736717224121094 + }, + { + "auxiliary_loss_clip": 0.01102332, + "auxiliary_loss_mlp": 0.01089016, + "balance_loss_clip": 1.03312719, + "balance_loss_mlp": 1.00690472, + "epoch": 0.19515421150724463, + "flos": 21033023892480.0, + "grad_norm": 1.5713396404686215, + "language_loss": 0.80511367, + "learning_rate": 3.7207778190364437e-06, + "loss": 0.82702714, + "num_input_tokens_seen": 34345615, + "step": 1623, + "time_per_iteration": 3.863142967224121 + }, + { + "auxiliary_loss_clip": 0.0112099, + "auxiliary_loss_mlp": 0.01088088, + "balance_loss_clip": 1.03820634, + "balance_loss_mlp": 1.00611949, + "epoch": 0.1952744543978837, + "flos": 32961255143040.0, + "grad_norm": 2.145243703132097, + "language_loss": 0.74171233, + "learning_rate": 3.720380694948302e-06, + "loss": 0.76380312, + "num_input_tokens_seen": 34368500, + "step": 1624, + "time_per_iteration": 2.980336904525757 + }, + { + "auxiliary_loss_clip": 0.0111976, + "auxiliary_loss_mlp": 0.01080502, + "balance_loss_clip": 1.03797472, + "balance_loss_mlp": 1.00039315, + "epoch": 0.19539469728852282, + "flos": 64044312030720.0, + "grad_norm": 1.0368179202873702, + "language_loss": 0.71276188, + "learning_rate": 3.719983309879777e-06, + "loss": 0.73476446, + "num_input_tokens_seen": 34428280, + "step": 1625, + "time_per_iteration": 3.406303882598877 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.01089519, + "balance_loss_clip": 1.03958499, + "balance_loss_mlp": 1.00731242, + "epoch": 0.1955149401791619, + "flos": 13370908078080.0, + "grad_norm": 1.5827171550775079, + "language_loss": 0.77574277, + "learning_rate": 3.719585663891151e-06, + "loss": 0.79802966, + "num_input_tokens_seen": 34445815, + "step": 1626, + "time_per_iteration": 2.7784464359283447 + }, + { + "auxiliary_loss_clip": 0.01124312, + "auxiliary_loss_mlp": 0.01092462, + "balance_loss_clip": 1.03597593, + "balance_loss_mlp": 1.01006413, + "epoch": 0.195635183069801, + "flos": 18728887184640.0, + "grad_norm": 2.297344383127018, + "language_loss": 0.78812551, + "learning_rate": 3.719187757042747e-06, + "loss": 0.81029332, + "num_input_tokens_seen": 34463635, + "step": 1627, + "time_per_iteration": 2.8610317707061768 + }, + { + "auxiliary_loss_clip": 0.01137643, + "auxiliary_loss_mlp": 0.01080791, + "balance_loss_clip": 1.03993773, + "balance_loss_mlp": 1.00068212, + "epoch": 0.1957554259604401, + "flos": 69313952615040.0, + "grad_norm": 0.7265233683465028, + "language_loss": 0.54970968, + "learning_rate": 3.7187895893949275e-06, + "loss": 0.57189405, + "num_input_tokens_seen": 34530105, + "step": 1628, + "time_per_iteration": 3.4200968742370605 + }, + { + "auxiliary_loss_clip": 0.01123438, + "auxiliary_loss_mlp": 0.01089737, + "balance_loss_clip": 1.03798127, + "balance_loss_mlp": 1.00738752, + "epoch": 0.19587566885107918, + "flos": 21069257736960.0, + "grad_norm": 2.441792182956617, + "language_loss": 0.76164079, + "learning_rate": 3.7183911610080937e-06, + "loss": 0.78377253, + "num_input_tokens_seen": 34546970, + "step": 1629, + "time_per_iteration": 2.879150629043579 + }, + { + "auxiliary_loss_clip": 0.01138453, + "auxiliary_loss_mlp": 0.01089928, + "balance_loss_clip": 1.03980637, + "balance_loss_mlp": 1.00748277, + "epoch": 0.19599591174171827, + "flos": 22194661731840.0, + "grad_norm": 2.2592371556576394, + "language_loss": 0.74920261, + "learning_rate": 3.7179924719426872e-06, + "loss": 0.7714864, + "num_input_tokens_seen": 34564865, + "step": 1630, + "time_per_iteration": 2.7790427207946777 + }, + { + "auxiliary_loss_clip": 0.01157571, + "auxiliary_loss_mlp": 0.01089874, + "balance_loss_clip": 1.04084539, + "balance_loss_mlp": 1.00738096, + "epoch": 0.19611615463235738, + "flos": 23768375374080.0, + "grad_norm": 2.6707539929370716, + "language_loss": 0.75722277, + "learning_rate": 3.7175935222591885e-06, + "loss": 0.77969718, + "num_input_tokens_seen": 34584165, + "step": 1631, + "time_per_iteration": 2.7539169788360596 + }, + { + "auxiliary_loss_clip": 0.0114902, + "auxiliary_loss_mlp": 0.01091147, + "balance_loss_clip": 1.04108036, + "balance_loss_mlp": 1.00894022, + "epoch": 0.19623639752299646, + "flos": 28618218731520.0, + "grad_norm": 1.872779222713454, + "language_loss": 0.74343157, + "learning_rate": 3.717194312018118e-06, + "loss": 0.76583326, + "num_input_tokens_seen": 34603150, + "step": 1632, + "time_per_iteration": 2.802842378616333 + }, + { + "auxiliary_loss_clip": 0.01156131, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_clip": 1.03939021, + "balance_loss_mlp": 1.0079205, + "epoch": 0.19635664041363554, + "flos": 21032700670080.0, + "grad_norm": 1.8102729052245514, + "language_loss": 0.7575165, + "learning_rate": 3.716794841280036e-06, + "loss": 0.77998245, + "num_input_tokens_seen": 34621855, + "step": 1633, + "time_per_iteration": 2.7277066707611084 + }, + { + "auxiliary_loss_clip": 0.01161726, + "auxiliary_loss_mlp": 0.01088759, + "balance_loss_clip": 1.04290664, + "balance_loss_mlp": 1.00636148, + "epoch": 0.19647688330427462, + "flos": 18879748306560.0, + "grad_norm": 2.3980060508696295, + "language_loss": 0.77442968, + "learning_rate": 3.7163951101055407e-06, + "loss": 0.79693455, + "num_input_tokens_seen": 34639915, + "step": 1634, + "time_per_iteration": 2.6595540046691895 + }, + { + "auxiliary_loss_clip": 0.01151395, + "auxiliary_loss_mlp": 0.01091418, + "balance_loss_clip": 1.04194057, + "balance_loss_mlp": 1.00916398, + "epoch": 0.19659712619491373, + "flos": 24242503921920.0, + "grad_norm": 2.4661590693683806, + "language_loss": 0.79134536, + "learning_rate": 3.715995118555273e-06, + "loss": 0.81377351, + "num_input_tokens_seen": 34659890, + "step": 1635, + "time_per_iteration": 2.8487632274627686 + }, + { + "auxiliary_loss_clip": 0.01129353, + "auxiliary_loss_mlp": 0.01090633, + "balance_loss_clip": 1.04015708, + "balance_loss_mlp": 1.00804496, + "epoch": 0.19671736908555282, + "flos": 24717422568960.0, + "grad_norm": 2.0259578952684656, + "language_loss": 0.85659307, + "learning_rate": 3.71559486668991e-06, + "loss": 0.87879294, + "num_input_tokens_seen": 34678750, + "step": 1636, + "time_per_iteration": 2.872677803039551 + }, + { + "auxiliary_loss_clip": 0.01163409, + "auxiliary_loss_mlp": 0.00874331, + "balance_loss_clip": 1.04443812, + "balance_loss_mlp": 1.00007677, + "epoch": 0.1968376119761919, + "flos": 23842279607040.0, + "grad_norm": 1.6372312971838097, + "language_loss": 0.77261299, + "learning_rate": 3.715194354570169e-06, + "loss": 0.79299033, + "num_input_tokens_seen": 34698755, + "step": 1637, + "time_per_iteration": 2.728691339492798 + }, + { + "auxiliary_loss_clip": 0.01158338, + "auxiliary_loss_mlp": 0.01091668, + "balance_loss_clip": 1.04172909, + "balance_loss_mlp": 1.00946176, + "epoch": 0.196957854866831, + "flos": 18113917409280.0, + "grad_norm": 2.4027313383277624, + "language_loss": 0.83270085, + "learning_rate": 3.714793582256809e-06, + "loss": 0.85520089, + "num_input_tokens_seen": 34715820, + "step": 1638, + "time_per_iteration": 2.8455758094787598 + }, + { + "auxiliary_loss_clip": 0.01169017, + "auxiliary_loss_mlp": 0.01088787, + "balance_loss_clip": 1.04333472, + "balance_loss_mlp": 1.00648534, + "epoch": 0.1970780977574701, + "flos": 21653129312640.0, + "grad_norm": 2.1412754792259916, + "language_loss": 0.85176742, + "learning_rate": 3.7143925498106253e-06, + "loss": 0.87434554, + "num_input_tokens_seen": 34734360, + "step": 1639, + "time_per_iteration": 3.5952513217926025 + }, + { + "auxiliary_loss_clip": 0.01151227, + "auxiliary_loss_mlp": 0.01092685, + "balance_loss_clip": 1.04055417, + "balance_loss_mlp": 1.01009679, + "epoch": 0.19719834064810918, + "flos": 20811813984000.0, + "grad_norm": 1.8695939767048129, + "language_loss": 0.79746556, + "learning_rate": 3.7139912572924558e-06, + "loss": 0.81990469, + "num_input_tokens_seen": 34753390, + "step": 1640, + "time_per_iteration": 2.7515347003936768 + }, + { + "auxiliary_loss_clip": 0.01159685, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_clip": 1.04141355, + "balance_loss_mlp": 1.00715637, + "epoch": 0.19731858353874826, + "flos": 23434800744960.0, + "grad_norm": 2.5572370725181286, + "language_loss": 0.80795461, + "learning_rate": 3.7135897047631744e-06, + "loss": 0.83044654, + "num_input_tokens_seen": 34771275, + "step": 1641, + "time_per_iteration": 2.761442184448242 + }, + { + "auxiliary_loss_clip": 0.01149482, + "auxiliary_loss_mlp": 0.01091031, + "balance_loss_clip": 1.04046333, + "balance_loss_mlp": 1.00863314, + "epoch": 0.19743882642938737, + "flos": 23988184652160.0, + "grad_norm": 1.9321793958246816, + "language_loss": 0.75987238, + "learning_rate": 3.713187892283698e-06, + "loss": 0.78227746, + "num_input_tokens_seen": 34790885, + "step": 1642, + "time_per_iteration": 2.812443256378174 + }, + { + "auxiliary_loss_clip": 0.01129124, + "auxiliary_loss_mlp": 0.0109187, + "balance_loss_clip": 1.03830838, + "balance_loss_mlp": 1.0093292, + "epoch": 0.19755906932002645, + "flos": 15004340081280.0, + "grad_norm": 2.068463858915478, + "language_loss": 0.87383258, + "learning_rate": 3.71278581991498e-06, + "loss": 0.89604247, + "num_input_tokens_seen": 34806745, + "step": 1643, + "time_per_iteration": 2.8978500366210938 + }, + { + "auxiliary_loss_clip": 0.01129523, + "auxiliary_loss_mlp": 0.00874378, + "balance_loss_clip": 1.03340542, + "balance_loss_mlp": 1.00007463, + "epoch": 0.19767931221066554, + "flos": 19494466686720.0, + "grad_norm": 1.7712661209497171, + "language_loss": 0.7897433, + "learning_rate": 3.712383487718015e-06, + "loss": 0.80978233, + "num_input_tokens_seen": 34824985, + "step": 1644, + "time_per_iteration": 3.7543280124664307 + }, + { + "auxiliary_loss_clip": 0.01117065, + "auxiliary_loss_mlp": 0.01090849, + "balance_loss_clip": 1.03873122, + "balance_loss_mlp": 1.00859475, + "epoch": 0.19779955510130465, + "flos": 25737895958400.0, + "grad_norm": 1.8409360619550248, + "language_loss": 0.86415446, + "learning_rate": 3.7119808957538365e-06, + "loss": 0.88623363, + "num_input_tokens_seen": 34843980, + "step": 1645, + "time_per_iteration": 2.9227135181427 + }, + { + "auxiliary_loss_clip": 0.0115025, + "auxiliary_loss_mlp": 0.0109447, + "balance_loss_clip": 1.04017854, + "balance_loss_mlp": 1.01192904, + "epoch": 0.19791979799194373, + "flos": 20777699041920.0, + "grad_norm": 2.0175040787403753, + "language_loss": 0.8048833, + "learning_rate": 3.711578044083517e-06, + "loss": 0.82733047, + "num_input_tokens_seen": 34860780, + "step": 1646, + "time_per_iteration": 3.6864724159240723 + }, + { + "auxiliary_loss_clip": 0.01147477, + "auxiliary_loss_mlp": 0.01089452, + "balance_loss_clip": 1.03916216, + "balance_loss_mlp": 1.00719774, + "epoch": 0.1980400408825828, + "flos": 25589010084480.0, + "grad_norm": 1.7222955333113932, + "language_loss": 0.74736637, + "learning_rate": 3.7111749327681698e-06, + "loss": 0.76973563, + "num_input_tokens_seen": 34880815, + "step": 1647, + "time_per_iteration": 2.7395312786102295 + }, + { + "auxiliary_loss_clip": 0.01159799, + "auxiliary_loss_mlp": 0.01088464, + "balance_loss_clip": 1.04212308, + "balance_loss_mlp": 1.00611401, + "epoch": 0.1981602837732219, + "flos": 23513840622720.0, + "grad_norm": 2.0828019046297164, + "language_loss": 0.86312014, + "learning_rate": 3.7107715618689455e-06, + "loss": 0.88560277, + "num_input_tokens_seen": 34899790, + "step": 1648, + "time_per_iteration": 3.687004804611206 + }, + { + "auxiliary_loss_clip": 0.01151547, + "auxiliary_loss_mlp": 0.01089572, + "balance_loss_clip": 1.04076326, + "balance_loss_mlp": 1.00726938, + "epoch": 0.198280526663861, + "flos": 23185365724800.0, + "grad_norm": 1.4228736409787754, + "language_loss": 0.83458, + "learning_rate": 3.710367931447035e-06, + "loss": 0.85699117, + "num_input_tokens_seen": 34921570, + "step": 1649, + "time_per_iteration": 2.7994470596313477 + }, + { + "auxiliary_loss_clip": 0.01159695, + "auxiliary_loss_mlp": 0.01090751, + "balance_loss_clip": 1.04146206, + "balance_loss_mlp": 1.00821066, + "epoch": 0.1984007695545001, + "flos": 21689470897920.0, + "grad_norm": 2.2185141127966634, + "language_loss": 0.86770856, + "learning_rate": 3.70996404156367e-06, + "loss": 0.89021301, + "num_input_tokens_seen": 34941205, + "step": 1650, + "time_per_iteration": 2.7304935455322266 + }, + { + "auxiliary_loss_clip": 0.01127982, + "auxiliary_loss_mlp": 0.01091152, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.00889754, + "epoch": 0.19852101244513917, + "flos": 36064008887040.0, + "grad_norm": 1.6664597439515714, + "language_loss": 0.72704124, + "learning_rate": 3.7095598922801187e-06, + "loss": 0.74923259, + "num_input_tokens_seen": 34963280, + "step": 1651, + "time_per_iteration": 2.972744941711426 + }, + { + "auxiliary_loss_clip": 0.01169291, + "auxiliary_loss_mlp": 0.01089556, + "balance_loss_clip": 1.04282713, + "balance_loss_mlp": 1.00739646, + "epoch": 0.19864125533577828, + "flos": 23105894883840.0, + "grad_norm": 2.0929328253713724, + "language_loss": 0.76061916, + "learning_rate": 3.7091554836576914e-06, + "loss": 0.78320754, + "num_input_tokens_seen": 34979955, + "step": 1652, + "time_per_iteration": 2.664449453353882 + }, + { + "auxiliary_loss_clip": 0.01155997, + "auxiliary_loss_mlp": 0.00874323, + "balance_loss_clip": 1.03964829, + "balance_loss_mlp": 1.00007439, + "epoch": 0.19876149822641737, + "flos": 24608505553920.0, + "grad_norm": 1.748442514073674, + "language_loss": 0.82863069, + "learning_rate": 3.708750815757736e-06, + "loss": 0.84893394, + "num_input_tokens_seen": 35000725, + "step": 1653, + "time_per_iteration": 2.7543439865112305 + }, + { + "auxiliary_loss_clip": 0.01149768, + "auxiliary_loss_mlp": 0.01089688, + "balance_loss_clip": 1.03830671, + "balance_loss_mlp": 1.00724316, + "epoch": 0.19888174111705645, + "flos": 32196645308160.0, + "grad_norm": 2.296169342845324, + "language_loss": 0.72900224, + "learning_rate": 3.7083458886416407e-06, + "loss": 0.75139678, + "num_input_tokens_seen": 35019920, + "step": 1654, + "time_per_iteration": 2.780132293701172 + }, + { + "auxiliary_loss_clip": 0.01112993, + "auxiliary_loss_mlp": 0.01089145, + "balance_loss_clip": 1.03523421, + "balance_loss_mlp": 1.00689054, + "epoch": 0.19900198400769553, + "flos": 24608469640320.0, + "grad_norm": 1.9006009813245661, + "language_loss": 0.88212955, + "learning_rate": 3.707940702370832e-06, + "loss": 0.90415084, + "num_input_tokens_seen": 35040765, + "step": 1655, + "time_per_iteration": 2.9223132133483887 + }, + { + "auxiliary_loss_clip": 0.01149247, + "auxiliary_loss_mlp": 0.01080345, + "balance_loss_clip": 1.04402518, + "balance_loss_mlp": 1.00023687, + "epoch": 0.19912222689833464, + "flos": 67915805673600.0, + "grad_norm": 0.7701458540396404, + "language_loss": 0.58286595, + "learning_rate": 3.707535257006777e-06, + "loss": 0.60516185, + "num_input_tokens_seen": 35106390, + "step": 1656, + "time_per_iteration": 3.3509695529937744 + }, + { + "auxiliary_loss_clip": 0.01140874, + "auxiliary_loss_mlp": 0.01091551, + "balance_loss_clip": 1.03424406, + "balance_loss_mlp": 1.00910592, + "epoch": 0.19924246978897373, + "flos": 15742340916480.0, + "grad_norm": 2.2488695651509354, + "language_loss": 0.88722473, + "learning_rate": 3.707129552610981e-06, + "loss": 0.909549, + "num_input_tokens_seen": 35125040, + "step": 1657, + "time_per_iteration": 2.7927894592285156 + }, + { + "auxiliary_loss_clip": 0.01148876, + "auxiliary_loss_mlp": 0.01089662, + "balance_loss_clip": 1.04056358, + "balance_loss_mlp": 1.00721729, + "epoch": 0.1993627126796128, + "flos": 17566566986880.0, + "grad_norm": 1.9336480385428307, + "language_loss": 0.74010444, + "learning_rate": 3.70672358924499e-06, + "loss": 0.7624898, + "num_input_tokens_seen": 35144280, + "step": 1658, + "time_per_iteration": 2.756869077682495 + }, + { + "auxiliary_loss_clip": 0.01134849, + "auxiliary_loss_mlp": 0.01089904, + "balance_loss_clip": 1.03717816, + "balance_loss_mlp": 1.00779283, + "epoch": 0.19948295557025192, + "flos": 40843826680320.0, + "grad_norm": 1.905464253523367, + "language_loss": 0.78954315, + "learning_rate": 3.706317366970386e-06, + "loss": 0.8117907, + "num_input_tokens_seen": 35165280, + "step": 1659, + "time_per_iteration": 2.915550708770752 + }, + { + "auxiliary_loss_clip": 0.01167143, + "auxiliary_loss_mlp": 0.00874341, + "balance_loss_clip": 1.0406456, + "balance_loss_mlp": 1.00002563, + "epoch": 0.199603198460891, + "flos": 25082418620160.0, + "grad_norm": 1.8338373204440876, + "language_loss": 0.84370643, + "learning_rate": 3.705910885848795e-06, + "loss": 0.86412132, + "num_input_tokens_seen": 35183655, + "step": 1660, + "time_per_iteration": 2.744084596633911 + }, + { + "auxiliary_loss_clip": 0.0115622, + "auxiliary_loss_mlp": 0.01091955, + "balance_loss_clip": 1.03984094, + "balance_loss_mlp": 1.00965261, + "epoch": 0.19972344135153008, + "flos": 20084120352000.0, + "grad_norm": 2.201152127849395, + "language_loss": 0.84979856, + "learning_rate": 3.705504145941879e-06, + "loss": 0.87228024, + "num_input_tokens_seen": 35201825, + "step": 1661, + "time_per_iteration": 2.676854372024536 + }, + { + "auxiliary_loss_clip": 0.01167643, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_clip": 1.04153526, + "balance_loss_mlp": 1.00661659, + "epoch": 0.1998436842421692, + "flos": 23727472761600.0, + "grad_norm": 2.594175052382042, + "language_loss": 0.78606069, + "learning_rate": 3.7050971473113403e-06, + "loss": 0.80862534, + "num_input_tokens_seen": 35221600, + "step": 1662, + "time_per_iteration": 2.677783727645874 + }, + { + "auxiliary_loss_clip": 0.01159921, + "auxiliary_loss_mlp": 0.0087419, + "balance_loss_clip": 1.04147243, + "balance_loss_mlp": 1.00002372, + "epoch": 0.19996392713280828, + "flos": 36102361633920.0, + "grad_norm": 1.6622540820018818, + "language_loss": 0.80270201, + "learning_rate": 3.7046898900189196e-06, + "loss": 0.82304311, + "num_input_tokens_seen": 35245935, + "step": 1663, + "time_per_iteration": 2.8169238567352295 + }, + { + "auxiliary_loss_clip": 0.01137784, + "auxiliary_loss_mlp": 0.01090639, + "balance_loss_clip": 1.03902042, + "balance_loss_mlp": 1.00814652, + "epoch": 0.20008417002344736, + "flos": 23657662679040.0, + "grad_norm": 1.6710312269281289, + "language_loss": 0.82997191, + "learning_rate": 3.704282374126398e-06, + "loss": 0.85225618, + "num_input_tokens_seen": 35265615, + "step": 1664, + "time_per_iteration": 3.747339963912964 + }, + { + "auxiliary_loss_clip": 0.01140866, + "auxiliary_loss_mlp": 0.01091282, + "balance_loss_clip": 1.03952932, + "balance_loss_mlp": 1.00878859, + "epoch": 0.20020441291408644, + "flos": 21872076664320.0, + "grad_norm": 1.7266466292221392, + "language_loss": 0.8752265, + "learning_rate": 3.7038745996955954e-06, + "loss": 0.89754802, + "num_input_tokens_seen": 35284960, + "step": 1665, + "time_per_iteration": 335.5954225063324 + }, + { + "auxiliary_loss_clip": 0.01134495, + "auxiliary_loss_mlp": 0.01088815, + "balance_loss_clip": 1.04027522, + "balance_loss_mlp": 1.00632226, + "epoch": 0.20032465580472555, + "flos": 23179691376000.0, + "grad_norm": 2.891539337073158, + "language_loss": 0.71808761, + "learning_rate": 3.703466566788371e-06, + "loss": 0.74032074, + "num_input_tokens_seen": 35304090, + "step": 1666, + "time_per_iteration": 2.8537368774414062 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01091837, + "balance_loss_clip": 1.03576899, + "balance_loss_mlp": 1.00934446, + "epoch": 0.20044489869536464, + "flos": 23873521461120.0, + "grad_norm": 1.8222849533514163, + "language_loss": 0.74852514, + "learning_rate": 3.703058275466622e-06, + "loss": 0.77086413, + "num_input_tokens_seen": 35323325, + "step": 1667, + "time_per_iteration": 2.8300163745880127 + }, + { + "auxiliary_loss_clip": 0.01145934, + "auxiliary_loss_mlp": 0.01089503, + "balance_loss_clip": 1.03861403, + "balance_loss_mlp": 1.00729656, + "epoch": 0.20056514158600372, + "flos": 21945226711680.0, + "grad_norm": 9.951457657436407, + "language_loss": 0.77612925, + "learning_rate": 3.7026497257922877e-06, + "loss": 0.79848361, + "num_input_tokens_seen": 35343635, + "step": 1668, + "time_per_iteration": 3.8789422512054443 + }, + { + "auxiliary_loss_clip": 0.01130635, + "auxiliary_loss_mlp": 0.0109008, + "balance_loss_clip": 1.03927875, + "balance_loss_mlp": 1.00796914, + "epoch": 0.20068538447664283, + "flos": 23879159896320.0, + "grad_norm": 2.170398997751297, + "language_loss": 0.8538813, + "learning_rate": 3.7022409178273436e-06, + "loss": 0.8760885, + "num_input_tokens_seen": 35364615, + "step": 1669, + "time_per_iteration": 2.9514427185058594 + }, + { + "auxiliary_loss_clip": 0.01156174, + "auxiliary_loss_mlp": 0.01087783, + "balance_loss_clip": 1.04030943, + "balance_loss_mlp": 1.00543308, + "epoch": 0.2008056273672819, + "flos": 18442823270400.0, + "grad_norm": 2.28810506433191, + "language_loss": 0.78541631, + "learning_rate": 3.7018318516338054e-06, + "loss": 0.8078559, + "num_input_tokens_seen": 35383775, + "step": 1670, + "time_per_iteration": 2.7107138633728027 + }, + { + "auxiliary_loss_clip": 0.01159302, + "auxiliary_loss_mlp": 0.01089142, + "balance_loss_clip": 1.0413847, + "balance_loss_mlp": 1.00703108, + "epoch": 0.200925870257921, + "flos": 23659530186240.0, + "grad_norm": 2.3112863879484173, + "language_loss": 0.8188687, + "learning_rate": 3.7014225272737284e-06, + "loss": 0.84135312, + "num_input_tokens_seen": 35403000, + "step": 1671, + "time_per_iteration": 3.7721569538116455 + }, + { + "auxiliary_loss_clip": 0.01159946, + "auxiliary_loss_mlp": 0.01090807, + "balance_loss_clip": 1.04194188, + "balance_loss_mlp": 1.00831413, + "epoch": 0.20104611314856008, + "flos": 16217115909120.0, + "grad_norm": 2.3726824634905026, + "language_loss": 0.74548459, + "learning_rate": 3.701012944809207e-06, + "loss": 0.76799208, + "num_input_tokens_seen": 35420115, + "step": 1672, + "time_per_iteration": 2.780635118484497 + }, + { + "auxiliary_loss_clip": 0.01143371, + "auxiliary_loss_mlp": 0.00874252, + "balance_loss_clip": 1.03834403, + "balance_loss_mlp": 1.00004268, + "epoch": 0.2011663560391992, + "flos": 21397373498880.0, + "grad_norm": 2.2119158964073464, + "language_loss": 0.79117858, + "learning_rate": 3.700603104302374e-06, + "loss": 0.81135476, + "num_input_tokens_seen": 35439925, + "step": 1673, + "time_per_iteration": 4.452728748321533 + }, + { + "auxiliary_loss_clip": 0.01101086, + "auxiliary_loss_mlp": 0.01080791, + "balance_loss_clip": 1.03162217, + "balance_loss_mlp": 1.00106382, + "epoch": 0.20128659892983827, + "flos": 62229459409920.0, + "grad_norm": 0.8954923784716591, + "language_loss": 0.55980068, + "learning_rate": 3.7001930058154027e-06, + "loss": 0.5816195, + "num_input_tokens_seen": 35504885, + "step": 1674, + "time_per_iteration": 3.3691768646240234 + }, + { + "auxiliary_loss_clip": 0.01135971, + "auxiliary_loss_mlp": 0.01089647, + "balance_loss_clip": 1.03867483, + "balance_loss_mlp": 1.00734532, + "epoch": 0.20140684182047736, + "flos": 28438737448320.0, + "grad_norm": 2.591428678629923, + "language_loss": 0.79679012, + "learning_rate": 3.6997826494105037e-06, + "loss": 0.81904626, + "num_input_tokens_seen": 35525330, + "step": 1675, + "time_per_iteration": 2.9695701599121094 + }, + { + "auxiliary_loss_clip": 0.01144727, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_clip": 1.03738451, + "balance_loss_mlp": 1.00722241, + "epoch": 0.20152708471111647, + "flos": 28074064619520.0, + "grad_norm": 2.1813023323249334, + "language_loss": 0.69177985, + "learning_rate": 3.6993720351499286e-06, + "loss": 0.71412241, + "num_input_tokens_seen": 35546455, + "step": 1676, + "time_per_iteration": 2.913783073425293 + }, + { + "auxiliary_loss_clip": 0.01139102, + "auxiliary_loss_mlp": 0.01088244, + "balance_loss_clip": 1.03498995, + "balance_loss_mlp": 1.00613248, + "epoch": 0.20164732760175555, + "flos": 23549751244800.0, + "grad_norm": 1.8917402319750394, + "language_loss": 0.76815623, + "learning_rate": 3.6989611630959666e-06, + "loss": 0.79042971, + "num_input_tokens_seen": 35565010, + "step": 1677, + "time_per_iteration": 2.900667190551758 + }, + { + "auxiliary_loss_clip": 0.01157336, + "auxiliary_loss_mlp": 0.01079934, + "balance_loss_clip": 1.05227137, + "balance_loss_mlp": 1.00020647, + "epoch": 0.20176757049239463, + "flos": 71100616037760.0, + "grad_norm": 0.679214022331348, + "language_loss": 0.58331954, + "learning_rate": 3.6985500333109474e-06, + "loss": 0.60569227, + "num_input_tokens_seen": 35633340, + "step": 1678, + "time_per_iteration": 3.4056694507598877 + }, + { + "auxiliary_loss_clip": 0.01139864, + "auxiliary_loss_mlp": 0.01090754, + "balance_loss_clip": 1.04068244, + "balance_loss_mlp": 1.00859511, + "epoch": 0.20188781338303372, + "flos": 21430159637760.0, + "grad_norm": 2.906474194972661, + "language_loss": 0.76519233, + "learning_rate": 3.6981386458572385e-06, + "loss": 0.78749847, + "num_input_tokens_seen": 35651315, + "step": 1679, + "time_per_iteration": 2.8275368213653564 + }, + { + "auxiliary_loss_clip": 0.0113617, + "auxiliary_loss_mlp": 0.01091516, + "balance_loss_clip": 1.03681135, + "balance_loss_mlp": 1.0090704, + "epoch": 0.20200805627367283, + "flos": 11546215130880.0, + "grad_norm": 3.231549740218615, + "language_loss": 0.76160789, + "learning_rate": 3.6977270007972468e-06, + "loss": 0.78388476, + "num_input_tokens_seen": 35668850, + "step": 1680, + "time_per_iteration": 2.9096102714538574 + }, + { + "auxiliary_loss_clip": 0.01147555, + "auxiliary_loss_mlp": 0.01089576, + "balance_loss_clip": 1.03924441, + "balance_loss_mlp": 1.00746417, + "epoch": 0.2021282991643119, + "flos": 28545391906560.0, + "grad_norm": 2.729023132600083, + "language_loss": 0.72377479, + "learning_rate": 3.6973150981934196e-06, + "loss": 0.74614614, + "num_input_tokens_seen": 35690080, + "step": 1681, + "time_per_iteration": 2.8248510360717773 + }, + { + "auxiliary_loss_clip": 0.01168335, + "auxiliary_loss_mlp": 0.01088026, + "balance_loss_clip": 1.0427742, + "balance_loss_mlp": 1.00567591, + "epoch": 0.202248542054951, + "flos": 17923446564480.0, + "grad_norm": 2.388674426558781, + "language_loss": 0.83446681, + "learning_rate": 3.6969029381082415e-06, + "loss": 0.85703045, + "num_input_tokens_seen": 35706075, + "step": 1682, + "time_per_iteration": 2.698563575744629 + }, + { + "auxiliary_loss_clip": 0.01139651, + "auxiliary_loss_mlp": 0.01088813, + "balance_loss_clip": 1.04131293, + "balance_loss_mlp": 1.00679731, + "epoch": 0.2023687849455901, + "flos": 19864634296320.0, + "grad_norm": 1.9330268038007647, + "language_loss": 0.79369533, + "learning_rate": 3.696490520604237e-06, + "loss": 0.81597996, + "num_input_tokens_seen": 35724765, + "step": 1683, + "time_per_iteration": 2.8423502445220947 + }, + { + "auxiliary_loss_clip": 0.01155988, + "auxiliary_loss_mlp": 0.01091163, + "balance_loss_clip": 1.04054081, + "balance_loss_mlp": 1.00938559, + "epoch": 0.20248902783622919, + "flos": 22564721600640.0, + "grad_norm": 1.7073951020515052, + "language_loss": 0.80700278, + "learning_rate": 3.696077845743968e-06, + "loss": 0.82947433, + "num_input_tokens_seen": 35744355, + "step": 1684, + "time_per_iteration": 2.7539236545562744 + }, + { + "auxiliary_loss_clip": 0.01166859, + "auxiliary_loss_mlp": 0.01090601, + "balance_loss_clip": 1.04095197, + "balance_loss_mlp": 1.00806057, + "epoch": 0.20260927072686827, + "flos": 22709728805760.0, + "grad_norm": 2.37121787299676, + "language_loss": 0.73324859, + "learning_rate": 3.69566491359004e-06, + "loss": 0.75582314, + "num_input_tokens_seen": 35761000, + "step": 1685, + "time_per_iteration": 2.7503066062927246 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.01090451, + "balance_loss_clip": 1.03922498, + "balance_loss_mlp": 1.00814939, + "epoch": 0.20272951361750738, + "flos": 51023998650240.0, + "grad_norm": 1.9253864401864347, + "language_loss": 0.69433916, + "learning_rate": 3.695251724205092e-06, + "loss": 0.71672732, + "num_input_tokens_seen": 35785360, + "step": 1686, + "time_per_iteration": 3.0538575649261475 + }, + { + "auxiliary_loss_clip": 0.01168255, + "auxiliary_loss_mlp": 0.01090613, + "balance_loss_clip": 1.04252291, + "balance_loss_mlp": 1.00835872, + "epoch": 0.20284975650814646, + "flos": 26578133879040.0, + "grad_norm": 1.5764829255174788, + "language_loss": 0.86628366, + "learning_rate": 3.6948382776518054e-06, + "loss": 0.88887239, + "num_input_tokens_seen": 35806065, + "step": 1687, + "time_per_iteration": 2.8204545974731445 + }, + { + "auxiliary_loss_clip": 0.01137056, + "auxiliary_loss_mlp": 0.01089772, + "balance_loss_clip": 1.0373956, + "balance_loss_mlp": 1.00751805, + "epoch": 0.20296999939878554, + "flos": 16034222833920.0, + "grad_norm": 9.136211954850086, + "language_loss": 0.79454559, + "learning_rate": 3.6944245739929e-06, + "loss": 0.81681389, + "num_input_tokens_seen": 35822225, + "step": 1688, + "time_per_iteration": 2.8601982593536377 + }, + { + "auxiliary_loss_clip": 0.01155093, + "auxiliary_loss_mlp": 0.01090891, + "balance_loss_clip": 1.03898263, + "balance_loss_mlp": 1.00839806, + "epoch": 0.20309024228942463, + "flos": 19203374868480.0, + "grad_norm": 2.5621694282610354, + "language_loss": 0.71577322, + "learning_rate": 3.6940106132911332e-06, + "loss": 0.73823309, + "num_input_tokens_seen": 35839410, + "step": 1689, + "time_per_iteration": 3.7471792697906494 + }, + { + "auxiliary_loss_clip": 0.01157554, + "auxiliary_loss_mlp": 0.01090425, + "balance_loss_clip": 1.04088497, + "balance_loss_mlp": 1.00821865, + "epoch": 0.20321048518006374, + "flos": 22821087945600.0, + "grad_norm": 1.8839122086370306, + "language_loss": 0.88642246, + "learning_rate": 3.6935963956093037e-06, + "loss": 0.90890217, + "num_input_tokens_seen": 35859495, + "step": 1690, + "time_per_iteration": 2.894160270690918 + }, + { + "auxiliary_loss_clip": 0.01159459, + "auxiliary_loss_mlp": 0.01089078, + "balance_loss_clip": 1.04231739, + "balance_loss_mlp": 1.00691938, + "epoch": 0.20333072807070282, + "flos": 19096397187840.0, + "grad_norm": 1.7813759391808968, + "language_loss": 0.69131929, + "learning_rate": 3.6931819210102474e-06, + "loss": 0.71380466, + "num_input_tokens_seen": 35878890, + "step": 1691, + "time_per_iteration": 2.789693593978882 + }, + { + "auxiliary_loss_clip": 0.01168058, + "auxiliary_loss_mlp": 0.01090208, + "balance_loss_clip": 1.04261971, + "balance_loss_mlp": 1.00771546, + "epoch": 0.2034509709613419, + "flos": 18180962144640.0, + "grad_norm": 1.8947059262028818, + "language_loss": 0.84733927, + "learning_rate": 3.6927671895568402e-06, + "loss": 0.86992192, + "num_input_tokens_seen": 35897950, + "step": 1692, + "time_per_iteration": 2.7904746532440186 + }, + { + "auxiliary_loss_clip": 0.0116866, + "auxiliary_loss_mlp": 0.0109137, + "balance_loss_clip": 1.04290247, + "balance_loss_mlp": 1.00897253, + "epoch": 0.20357121385198101, + "flos": 22923899648640.0, + "grad_norm": 2.874617692747027, + "language_loss": 0.86793917, + "learning_rate": 3.692352201311996e-06, + "loss": 0.89053947, + "num_input_tokens_seen": 35916800, + "step": 1693, + "time_per_iteration": 2.7192537784576416 + }, + { + "auxiliary_loss_clip": 0.01136788, + "auxiliary_loss_mlp": 0.01088656, + "balance_loss_clip": 1.03950202, + "balance_loss_mlp": 1.00644946, + "epoch": 0.2036914567426201, + "flos": 20922131629440.0, + "grad_norm": 1.8102097699856234, + "language_loss": 0.76524508, + "learning_rate": 3.6919369563386687e-06, + "loss": 0.78749961, + "num_input_tokens_seen": 35936600, + "step": 1694, + "time_per_iteration": 3.84299898147583 + }, + { + "auxiliary_loss_clip": 0.01140618, + "auxiliary_loss_mlp": 0.01089391, + "balance_loss_clip": 1.03787827, + "balance_loss_mlp": 1.00737476, + "epoch": 0.20381169963325918, + "flos": 15519155760000.0, + "grad_norm": 1.9458696196055474, + "language_loss": 0.79157841, + "learning_rate": 3.69152145469985e-06, + "loss": 0.81387854, + "num_input_tokens_seen": 35953645, + "step": 1695, + "time_per_iteration": 2.7963125705718994 + }, + { + "auxiliary_loss_clip": 0.01126917, + "auxiliary_loss_mlp": 0.01089664, + "balance_loss_clip": 1.03732932, + "balance_loss_mlp": 1.0072186, + "epoch": 0.20393194252389826, + "flos": 28833143760000.0, + "grad_norm": 1.863424454048975, + "language_loss": 0.81811482, + "learning_rate": 3.691105696458572e-06, + "loss": 0.84028065, + "num_input_tokens_seen": 35970940, + "step": 1696, + "time_per_iteration": 2.89083194732666 + }, + { + "auxiliary_loss_clip": 0.01170138, + "auxiliary_loss_mlp": 0.01090259, + "balance_loss_clip": 1.04491711, + "balance_loss_mlp": 1.00819564, + "epoch": 0.20405218541453737, + "flos": 22488554810880.0, + "grad_norm": 2.7871213981079475, + "language_loss": 0.68604457, + "learning_rate": 3.690689681677904e-06, + "loss": 0.70864856, + "num_input_tokens_seen": 35989410, + "step": 1697, + "time_per_iteration": 3.687525987625122 + }, + { + "auxiliary_loss_clip": 0.01147952, + "auxiliary_loss_mlp": 0.01089782, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.00776577, + "epoch": 0.20417242830517646, + "flos": 25374408278400.0, + "grad_norm": 1.7810608114813329, + "language_loss": 0.88539463, + "learning_rate": 3.690273410420956e-06, + "loss": 0.90777194, + "num_input_tokens_seen": 36009175, + "step": 1698, + "time_per_iteration": 2.9110498428344727 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01088885, + "balance_loss_clip": 1.03800035, + "balance_loss_mlp": 1.00672626, + "epoch": 0.20429267119581554, + "flos": 14793078240000.0, + "grad_norm": 2.7530927843347888, + "language_loss": 0.76398927, + "learning_rate": 3.689856882750875e-06, + "loss": 0.78641081, + "num_input_tokens_seen": 36024375, + "step": 1699, + "time_per_iteration": 4.077358245849609 + }, + { + "auxiliary_loss_clip": 0.01157002, + "auxiliary_loss_mlp": 0.01089561, + "balance_loss_clip": 1.04194629, + "balance_loss_mlp": 1.00735402, + "epoch": 0.20441291408645465, + "flos": 17781851151360.0, + "grad_norm": 1.6278093145977226, + "language_loss": 0.78708023, + "learning_rate": 3.6894400987308486e-06, + "loss": 0.80954587, + "num_input_tokens_seen": 36041895, + "step": 1700, + "time_per_iteration": 2.744781732559204 + }, + { + "auxiliary_loss_clip": 0.01157405, + "auxiliary_loss_mlp": 0.01089619, + "balance_loss_clip": 1.04012847, + "balance_loss_mlp": 1.00707817, + "epoch": 0.20453315697709373, + "flos": 16435668211200.0, + "grad_norm": 1.9619448023995512, + "language_loss": 0.84828252, + "learning_rate": 3.6890230584241024e-06, + "loss": 0.87075275, + "num_input_tokens_seen": 36058825, + "step": 1701, + "time_per_iteration": 2.7270750999450684 + }, + { + "auxiliary_loss_clip": 0.0116332, + "auxiliary_loss_mlp": 0.01079723, + "balance_loss_clip": 1.05053949, + "balance_loss_mlp": 0.99999571, + "epoch": 0.20465339986773282, + "flos": 66713085653760.0, + "grad_norm": 1.10022358726827, + "language_loss": 0.66358888, + "learning_rate": 3.6886057618939016e-06, + "loss": 0.6860193, + "num_input_tokens_seen": 36121645, + "step": 1702, + "time_per_iteration": 3.3728818893432617 + }, + { + "auxiliary_loss_clip": 0.01132816, + "auxiliary_loss_mlp": 0.01090367, + "balance_loss_clip": 1.04285634, + "balance_loss_mlp": 1.00825584, + "epoch": 0.2047736427583719, + "flos": 41974114924800.0, + "grad_norm": 2.104203601840046, + "language_loss": 0.69051838, + "learning_rate": 3.6881882092035492e-06, + "loss": 0.7127502, + "num_input_tokens_seen": 36143030, + "step": 1703, + "time_per_iteration": 3.0853443145751953 + }, + { + "auxiliary_loss_clip": 0.01138321, + "auxiliary_loss_mlp": 0.00873597, + "balance_loss_clip": 1.04965353, + "balance_loss_mlp": 1.0000304, + "epoch": 0.204893885649011, + "flos": 69940878641280.0, + "grad_norm": 0.9215606391743302, + "language_loss": 0.6125797, + "learning_rate": 3.6877704004163873e-06, + "loss": 0.63269889, + "num_input_tokens_seen": 36203435, + "step": 1704, + "time_per_iteration": 3.612745523452759 + }, + { + "auxiliary_loss_clip": 0.01169479, + "auxiliary_loss_mlp": 0.01090466, + "balance_loss_clip": 1.04391944, + "balance_loss_mlp": 1.00821185, + "epoch": 0.2050141285396501, + "flos": 22200012858240.0, + "grad_norm": 1.685661784420749, + "language_loss": 0.77700347, + "learning_rate": 3.6873523355957984e-06, + "loss": 0.79960293, + "num_input_tokens_seen": 36222435, + "step": 1705, + "time_per_iteration": 2.8634610176086426 + }, + { + "auxiliary_loss_clip": 0.01162442, + "auxiliary_loss_mlp": 0.01079798, + "balance_loss_clip": 1.04983044, + "balance_loss_mlp": 1.00007045, + "epoch": 0.20513437143028918, + "flos": 46283721730560.0, + "grad_norm": 0.9926457674479467, + "language_loss": 0.64171553, + "learning_rate": 3.686934014805201e-06, + "loss": 0.6641379, + "num_input_tokens_seen": 36273065, + "step": 1706, + "time_per_iteration": 3.1411190032958984 + }, + { + "auxiliary_loss_clip": 0.01156256, + "auxiliary_loss_mlp": 0.01092685, + "balance_loss_clip": 1.04089427, + "balance_loss_mlp": 1.01038301, + "epoch": 0.20525461432092829, + "flos": 21904324099200.0, + "grad_norm": 1.8058713294819981, + "language_loss": 0.80706799, + "learning_rate": 3.6865154381080552e-06, + "loss": 0.82955742, + "num_input_tokens_seen": 36293750, + "step": 1707, + "time_per_iteration": 2.88077712059021 + }, + { + "auxiliary_loss_clip": 0.01104711, + "auxiliary_loss_mlp": 0.01088951, + "balance_loss_clip": 1.03320479, + "balance_loss_mlp": 1.00698256, + "epoch": 0.20537485721156737, + "flos": 21214264942080.0, + "grad_norm": 3.0782810089782795, + "language_loss": 0.82277882, + "learning_rate": 3.6860966055678585e-06, + "loss": 0.84471548, + "num_input_tokens_seen": 36310105, + "step": 1708, + "time_per_iteration": 3.0000882148742676 + }, + { + "auxiliary_loss_clip": 0.01153809, + "auxiliary_loss_mlp": 0.01090587, + "balance_loss_clip": 1.03884554, + "balance_loss_mlp": 1.00823724, + "epoch": 0.20549510010220645, + "flos": 20191205773440.0, + "grad_norm": 1.8230796984548847, + "language_loss": 0.86659527, + "learning_rate": 3.685677517248147e-06, + "loss": 0.88903928, + "num_input_tokens_seen": 36328995, + "step": 1709, + "time_per_iteration": 2.715757131576538 + }, + { + "auxiliary_loss_clip": 0.01146771, + "auxiliary_loss_mlp": 0.00874167, + "balance_loss_clip": 1.04181337, + "balance_loss_mlp": 1.00004017, + "epoch": 0.20561534299284553, + "flos": 17016702612480.0, + "grad_norm": 2.709457765665522, + "language_loss": 0.80165362, + "learning_rate": 3.6852581732124967e-06, + "loss": 0.821863, + "num_input_tokens_seen": 36346340, + "step": 1710, + "time_per_iteration": 2.8625566959381104 + }, + { + "auxiliary_loss_clip": 0.01155017, + "auxiliary_loss_mlp": 0.01091445, + "balance_loss_clip": 1.04029191, + "balance_loss_mlp": 1.00900042, + "epoch": 0.20573558588348465, + "flos": 22890467064960.0, + "grad_norm": 1.7257767411503795, + "language_loss": 0.76105785, + "learning_rate": 3.6848385735245213e-06, + "loss": 0.78352249, + "num_input_tokens_seen": 36365430, + "step": 1711, + "time_per_iteration": 2.8335771560668945 + }, + { + "auxiliary_loss_clip": 0.01155852, + "auxiliary_loss_mlp": 0.01090227, + "balance_loss_clip": 1.04024088, + "balance_loss_mlp": 1.00811517, + "epoch": 0.20585582877412373, + "flos": 24643123286400.0, + "grad_norm": 1.8035663970268656, + "language_loss": 0.86049414, + "learning_rate": 3.6844187182478734e-06, + "loss": 0.88295484, + "num_input_tokens_seen": 36386285, + "step": 1712, + "time_per_iteration": 2.813650131225586 + }, + { + "auxiliary_loss_clip": 0.01146016, + "auxiliary_loss_mlp": 0.01089998, + "balance_loss_clip": 1.0393064, + "balance_loss_mlp": 1.00788689, + "epoch": 0.2059760716647628, + "flos": 24206952435840.0, + "grad_norm": 3.3874736319567154, + "language_loss": 0.74597692, + "learning_rate": 3.683998607446246e-06, + "loss": 0.76833707, + "num_input_tokens_seen": 36404935, + "step": 1713, + "time_per_iteration": 2.8866260051727295 + }, + { + "auxiliary_loss_clip": 0.01158265, + "auxiliary_loss_mlp": 0.01090037, + "balance_loss_clip": 1.041942, + "balance_loss_mlp": 1.00797284, + "epoch": 0.20609631455540192, + "flos": 20229522606720.0, + "grad_norm": 1.9030700292566287, + "language_loss": 0.75133574, + "learning_rate": 3.6835782411833686e-06, + "loss": 0.77381879, + "num_input_tokens_seen": 36424455, + "step": 1714, + "time_per_iteration": 2.9491043090820312 + }, + { + "auxiliary_loss_clip": 0.01135418, + "auxiliary_loss_mlp": 0.01090108, + "balance_loss_clip": 1.03824139, + "balance_loss_mlp": 1.00799656, + "epoch": 0.206216557446041, + "flos": 19864957518720.0, + "grad_norm": 1.6374482796413208, + "language_loss": 0.74288833, + "learning_rate": 3.68315761952301e-06, + "loss": 0.76514357, + "num_input_tokens_seen": 36441685, + "step": 1715, + "time_per_iteration": 3.7358949184417725 + }, + { + "auxiliary_loss_clip": 0.01166535, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_clip": 1.04154325, + "balance_loss_mlp": 1.00847936, + "epoch": 0.2063368003366801, + "flos": 24096311568000.0, + "grad_norm": 2.0874274587054713, + "language_loss": 0.83393747, + "learning_rate": 3.6827367425289797e-06, + "loss": 0.85651016, + "num_input_tokens_seen": 36461460, + "step": 1716, + "time_per_iteration": 2.8232970237731934 + }, + { + "auxiliary_loss_clip": 0.01145305, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_clip": 1.03903246, + "balance_loss_mlp": 1.00882506, + "epoch": 0.2064570432273192, + "flos": 20340163474560.0, + "grad_norm": 2.603534834472442, + "language_loss": 0.72656393, + "learning_rate": 3.6823156102651225e-06, + "loss": 0.74892825, + "num_input_tokens_seen": 36479615, + "step": 1717, + "time_per_iteration": 2.899954319000244 + }, + { + "auxiliary_loss_clip": 0.01101594, + "auxiliary_loss_mlp": 0.01089981, + "balance_loss_clip": 1.02933264, + "balance_loss_mlp": 1.0078218, + "epoch": 0.20657728611795828, + "flos": 20520363029760.0, + "grad_norm": 1.7477693279380548, + "language_loss": 0.70953369, + "learning_rate": 3.6818942227953257e-06, + "loss": 0.73144948, + "num_input_tokens_seen": 36500160, + "step": 1718, + "time_per_iteration": 3.0393946170806885 + }, + { + "auxiliary_loss_clip": 0.01134593, + "auxiliary_loss_mlp": 0.01089776, + "balance_loss_clip": 1.03689432, + "balance_loss_mlp": 1.0076642, + "epoch": 0.20669752900859736, + "flos": 21799285752960.0, + "grad_norm": 1.9405483713365306, + "language_loss": 0.68850505, + "learning_rate": 3.681472580183512e-06, + "loss": 0.71074879, + "num_input_tokens_seen": 36518810, + "step": 1719, + "time_per_iteration": 2.8385541439056396 + }, + { + "auxiliary_loss_clip": 0.01157031, + "auxiliary_loss_mlp": 0.01090357, + "balance_loss_clip": 1.04224277, + "balance_loss_mlp": 1.00843596, + "epoch": 0.20681777189923645, + "flos": 15122020014720.0, + "grad_norm": 1.8029687626104844, + "language_loss": 0.86439735, + "learning_rate": 3.6810506824936455e-06, + "loss": 0.88687122, + "num_input_tokens_seen": 36536890, + "step": 1720, + "time_per_iteration": 3.7731218338012695 + }, + { + "auxiliary_loss_clip": 0.01142605, + "auxiliary_loss_mlp": 0.01079718, + "balance_loss_clip": 1.04689384, + "balance_loss_mlp": 0.99999058, + "epoch": 0.20693801478987556, + "flos": 56481021509760.0, + "grad_norm": 1.1006526276035968, + "language_loss": 0.62586296, + "learning_rate": 3.680628529789726e-06, + "loss": 0.64808619, + "num_input_tokens_seen": 36589300, + "step": 1721, + "time_per_iteration": 3.247652053833008 + }, + { + "auxiliary_loss_clip": 0.01167912, + "auxiliary_loss_mlp": 0.01090509, + "balance_loss_clip": 1.04262888, + "balance_loss_mlp": 1.00801623, + "epoch": 0.20705825768051464, + "flos": 21614201948160.0, + "grad_norm": 1.8820028952539711, + "language_loss": 0.86321414, + "learning_rate": 3.680206122135796e-06, + "loss": 0.88579834, + "num_input_tokens_seen": 36609905, + "step": 1722, + "time_per_iteration": 2.7229456901550293 + }, + { + "auxiliary_loss_clip": 0.01128351, + "auxiliary_loss_mlp": 0.01091329, + "balance_loss_clip": 1.03882408, + "balance_loss_mlp": 1.00940824, + "epoch": 0.20717850057115372, + "flos": 25848895962240.0, + "grad_norm": 1.7636686229171272, + "language_loss": 0.78347778, + "learning_rate": 3.6797834595959323e-06, + "loss": 0.80567455, + "num_input_tokens_seen": 36629805, + "step": 1723, + "time_per_iteration": 3.831005573272705 + }, + { + "auxiliary_loss_clip": 0.01125219, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_clip": 1.0365063, + "balance_loss_mlp": 1.00854862, + "epoch": 0.20729874346179283, + "flos": 29130807767040.0, + "grad_norm": 4.768229786663042, + "language_loss": 0.77633196, + "learning_rate": 3.679360542234254e-06, + "loss": 0.79849356, + "num_input_tokens_seen": 36649150, + "step": 1724, + "time_per_iteration": 3.8894355297088623 + }, + { + "auxiliary_loss_clip": 0.01149353, + "auxiliary_loss_mlp": 0.00874207, + "balance_loss_clip": 1.04069686, + "balance_loss_mlp": 0.99999189, + "epoch": 0.20741898635243192, + "flos": 29023363209600.0, + "grad_norm": 1.7355893138204153, + "language_loss": 0.72456849, + "learning_rate": 3.678937370114916e-06, + "loss": 0.74480414, + "num_input_tokens_seen": 36668955, + "step": 1725, + "time_per_iteration": 2.8797168731689453 + }, + { + "auxiliary_loss_clip": 0.01143544, + "auxiliary_loss_mlp": 0.0108815, + "balance_loss_clip": 1.03805637, + "balance_loss_mlp": 1.00642025, + "epoch": 0.207539229243071, + "flos": 15559447841280.0, + "grad_norm": 4.038508705060955, + "language_loss": 0.78797209, + "learning_rate": 3.678513943302114e-06, + "loss": 0.81028903, + "num_input_tokens_seen": 36685730, + "step": 1726, + "time_per_iteration": 2.868926525115967 + }, + { + "auxiliary_loss_clip": 0.01166354, + "auxiliary_loss_mlp": 0.01090232, + "balance_loss_clip": 1.04178214, + "balance_loss_mlp": 1.00835884, + "epoch": 0.20765947213371008, + "flos": 20521081301760.0, + "grad_norm": 1.6130501731473252, + "language_loss": 0.84742451, + "learning_rate": 3.678090261860082e-06, + "loss": 0.86999041, + "num_input_tokens_seen": 36705460, + "step": 1727, + "time_per_iteration": 2.755845308303833 + }, + { + "auxiliary_loss_clip": 0.01136242, + "auxiliary_loss_mlp": 0.01090423, + "balance_loss_clip": 1.03709257, + "balance_loss_mlp": 1.00854981, + "epoch": 0.2077797150243492, + "flos": 19354415558400.0, + "grad_norm": 2.0447598267257945, + "language_loss": 0.77588576, + "learning_rate": 3.6776663258530906e-06, + "loss": 0.79815233, + "num_input_tokens_seen": 36724110, + "step": 1728, + "time_per_iteration": 2.8460943698883057 + }, + { + "auxiliary_loss_clip": 0.0115763, + "auxiliary_loss_mlp": 0.01090357, + "balance_loss_clip": 1.04131007, + "balance_loss_mlp": 1.00834095, + "epoch": 0.20789995791498828, + "flos": 21829952989440.0, + "grad_norm": 2.979864820499342, + "language_loss": 0.71595061, + "learning_rate": 3.6772421353454516e-06, + "loss": 0.7384305, + "num_input_tokens_seen": 36742705, + "step": 1729, + "time_per_iteration": 2.8417141437530518 + }, + { + "auxiliary_loss_clip": 0.01154192, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_clip": 1.03952038, + "balance_loss_mlp": 1.0073123, + "epoch": 0.20802020080562736, + "flos": 23148844571520.0, + "grad_norm": 1.9158512953223246, + "language_loss": 0.88769972, + "learning_rate": 3.6768176904015153e-06, + "loss": 0.91013587, + "num_input_tokens_seen": 36762510, + "step": 1730, + "time_per_iteration": 2.8014779090881348 + }, + { + "auxiliary_loss_clip": 0.01155847, + "auxiliary_loss_mlp": 0.01088733, + "balance_loss_clip": 1.03992665, + "balance_loss_mlp": 1.00690818, + "epoch": 0.20814044369626647, + "flos": 23072677781760.0, + "grad_norm": 2.482313428145405, + "language_loss": 0.60385966, + "learning_rate": 3.6763929910856674e-06, + "loss": 0.62630546, + "num_input_tokens_seen": 36780960, + "step": 1731, + "time_per_iteration": 2.8650074005126953 + }, + { + "auxiliary_loss_clip": 0.01154384, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_clip": 1.040416, + "balance_loss_mlp": 1.00817025, + "epoch": 0.20826068658690555, + "flos": 19608016556160.0, + "grad_norm": 2.2366253976783446, + "language_loss": 0.77395302, + "learning_rate": 3.6759680374623365e-06, + "loss": 0.79639781, + "num_input_tokens_seen": 36798875, + "step": 1732, + "time_per_iteration": 2.8350043296813965 + }, + { + "auxiliary_loss_clip": 0.01166333, + "auxiliary_loss_mlp": 0.01090541, + "balance_loss_clip": 1.04288268, + "balance_loss_mlp": 1.00866818, + "epoch": 0.20838092947754464, + "flos": 25374049142400.0, + "grad_norm": 2.326796842918629, + "language_loss": 0.75601923, + "learning_rate": 3.675542829595986e-06, + "loss": 0.77858794, + "num_input_tokens_seen": 36818540, + "step": 1733, + "time_per_iteration": 2.777932643890381 + }, + { + "auxiliary_loss_clip": 0.01146459, + "auxiliary_loss_mlp": 0.01088187, + "balance_loss_clip": 1.04023182, + "balance_loss_mlp": 1.0063138, + "epoch": 0.20850117236818372, + "flos": 24061729749120.0, + "grad_norm": 1.4302085279163104, + "language_loss": 0.79036635, + "learning_rate": 3.6751173675511213e-06, + "loss": 0.81271279, + "num_input_tokens_seen": 36840585, + "step": 1734, + "time_per_iteration": 2.8572134971618652 + }, + { + "auxiliary_loss_clip": 0.0114743, + "auxiliary_loss_mlp": 0.01089199, + "balance_loss_clip": 1.03937578, + "balance_loss_mlp": 1.00699234, + "epoch": 0.20862141525882283, + "flos": 20077799558400.0, + "grad_norm": 3.334104675885126, + "language_loss": 0.87587094, + "learning_rate": 3.674691651392283e-06, + "loss": 0.89823723, + "num_input_tokens_seen": 36858255, + "step": 1735, + "time_per_iteration": 2.745680093765259 + }, + { + "auxiliary_loss_clip": 0.01145369, + "auxiliary_loss_mlp": 0.01090704, + "balance_loss_clip": 1.03876853, + "balance_loss_mlp": 1.00859272, + "epoch": 0.2087416581494619, + "flos": 39015183237120.0, + "grad_norm": 2.0112380030212536, + "language_loss": 0.75841653, + "learning_rate": 3.674265681184053e-06, + "loss": 0.78077728, + "num_input_tokens_seen": 36881515, + "step": 1736, + "time_per_iteration": 2.9309985637664795 + }, + { + "auxiliary_loss_clip": 0.01146716, + "auxiliary_loss_mlp": 0.0108985, + "balance_loss_clip": 1.03969669, + "balance_loss_mlp": 1.00778627, + "epoch": 0.208861901040101, + "flos": 26101994169600.0, + "grad_norm": 1.7492366715154088, + "language_loss": 0.86428505, + "learning_rate": 3.6738394569910504e-06, + "loss": 0.88665068, + "num_input_tokens_seen": 36902055, + "step": 1737, + "time_per_iteration": 2.7894160747528076 + }, + { + "auxiliary_loss_clip": 0.01147632, + "auxiliary_loss_mlp": 0.01091361, + "balance_loss_clip": 1.03453755, + "balance_loss_mlp": 1.00915468, + "epoch": 0.2089821439307401, + "flos": 28398732675840.0, + "grad_norm": 2.0365318118265057, + "language_loss": 0.82642472, + "learning_rate": 3.6734129788779333e-06, + "loss": 0.84881467, + "num_input_tokens_seen": 36921230, + "step": 1738, + "time_per_iteration": 2.8307669162750244 + }, + { + "auxiliary_loss_clip": 0.01131269, + "auxiliary_loss_mlp": 0.01087814, + "balance_loss_clip": 1.039294, + "balance_loss_mlp": 1.00594139, + "epoch": 0.2091023868213792, + "flos": 21069616872960.0, + "grad_norm": 1.7609119144373289, + "language_loss": 0.90444976, + "learning_rate": 3.6729862469093976e-06, + "loss": 0.92664057, + "num_input_tokens_seen": 36940325, + "step": 1739, + "time_per_iteration": 2.7732300758361816 + }, + { + "auxiliary_loss_clip": 0.01138321, + "auxiliary_loss_mlp": 0.01090078, + "balance_loss_clip": 1.0384109, + "balance_loss_mlp": 1.00806165, + "epoch": 0.20922262971201827, + "flos": 22455481363200.0, + "grad_norm": 17.14988064039814, + "language_loss": 0.82890642, + "learning_rate": 3.6725592611501782e-06, + "loss": 0.85119045, + "num_input_tokens_seen": 36959000, + "step": 1740, + "time_per_iteration": 2.7894558906555176 + }, + { + "auxiliary_loss_clip": 0.01156756, + "auxiliary_loss_mlp": 0.01090411, + "balance_loss_clip": 1.04088378, + "balance_loss_mlp": 1.00830007, + "epoch": 0.20934287260265738, + "flos": 27852244179840.0, + "grad_norm": 2.4067646534856126, + "language_loss": 0.76643109, + "learning_rate": 3.6721320216650496e-06, + "loss": 0.78890276, + "num_input_tokens_seen": 36979615, + "step": 1741, + "time_per_iteration": 3.7123565673828125 + }, + { + "auxiliary_loss_clip": 0.01143296, + "auxiliary_loss_mlp": 0.01089109, + "balance_loss_clip": 1.03758991, + "balance_loss_mlp": 1.00704563, + "epoch": 0.20946311549329646, + "flos": 16435309075200.0, + "grad_norm": 1.6990392849107676, + "language_loss": 0.83719033, + "learning_rate": 3.6717045285188215e-06, + "loss": 0.85951436, + "num_input_tokens_seen": 36997310, + "step": 1742, + "time_per_iteration": 2.7101223468780518 + }, + { + "auxiliary_loss_clip": 0.01122412, + "auxiliary_loss_mlp": 0.01088802, + "balance_loss_clip": 1.0343492, + "balance_loss_mlp": 1.0066427, + "epoch": 0.20958335838393555, + "flos": 22492720788480.0, + "grad_norm": 3.4585983670674416, + "language_loss": 0.87011397, + "learning_rate": 3.671276781776346e-06, + "loss": 0.89222604, + "num_input_tokens_seen": 37015965, + "step": 1743, + "time_per_iteration": 2.855686902999878 + }, + { + "auxiliary_loss_clip": 0.01136369, + "auxiliary_loss_mlp": 0.01088485, + "balance_loss_clip": 1.0408833, + "balance_loss_mlp": 1.00651717, + "epoch": 0.20970360127457463, + "flos": 25224768218880.0, + "grad_norm": 1.9627349459309529, + "language_loss": 0.67574966, + "learning_rate": 3.6708487815025128e-06, + "loss": 0.69799829, + "num_input_tokens_seen": 37036545, + "step": 1744, + "time_per_iteration": 2.845907211303711 + }, + { + "auxiliary_loss_clip": 0.01133908, + "auxiliary_loss_mlp": 0.01089016, + "balance_loss_clip": 1.03701282, + "balance_loss_mlp": 1.00680947, + "epoch": 0.20982384416521374, + "flos": 18479164855680.0, + "grad_norm": 2.0152839607751876, + "language_loss": 0.74039131, + "learning_rate": 3.6704205277622463e-06, + "loss": 0.76262051, + "num_input_tokens_seen": 37054985, + "step": 1745, + "time_per_iteration": 2.862767219543457 + }, + { + "auxiliary_loss_clip": 0.01147597, + "auxiliary_loss_mlp": 0.01090451, + "balance_loss_clip": 1.03993368, + "balance_loss_mlp": 1.00829208, + "epoch": 0.20994408705585282, + "flos": 25373546352000.0, + "grad_norm": 1.833617781866654, + "language_loss": 0.80748087, + "learning_rate": 3.6699920206205146e-06, + "loss": 0.8298614, + "num_input_tokens_seen": 37075725, + "step": 1746, + "time_per_iteration": 3.8824031352996826 + }, + { + "auxiliary_loss_clip": 0.01156812, + "auxiliary_loss_mlp": 0.01088853, + "balance_loss_clip": 1.04081631, + "balance_loss_mlp": 1.00688517, + "epoch": 0.2100643299464919, + "flos": 21320955313920.0, + "grad_norm": 1.68854012293361, + "language_loss": 0.82339847, + "learning_rate": 3.669563260142321e-06, + "loss": 0.84585512, + "num_input_tokens_seen": 37094615, + "step": 1747, + "time_per_iteration": 2.8053760528564453 + }, + { + "auxiliary_loss_clip": 0.01139098, + "auxiliary_loss_mlp": 0.01091303, + "balance_loss_clip": 1.03732193, + "balance_loss_mlp": 1.00919199, + "epoch": 0.21018457283713102, + "flos": 19354379644800.0, + "grad_norm": 2.4279827679665504, + "language_loss": 0.83940756, + "learning_rate": 3.6691342463927083e-06, + "loss": 0.86171162, + "num_input_tokens_seen": 37113610, + "step": 1748, + "time_per_iteration": 2.8794469833374023 + }, + { + "auxiliary_loss_clip": 0.01131158, + "auxiliary_loss_mlp": 0.01091111, + "balance_loss_clip": 1.0386765, + "balance_loss_mlp": 1.00885606, + "epoch": 0.2103048157277701, + "flos": 28330035914880.0, + "grad_norm": 1.5851624159239504, + "language_loss": 0.81768751, + "learning_rate": 3.668704979436758e-06, + "loss": 0.83991015, + "num_input_tokens_seen": 37133705, + "step": 1749, + "time_per_iteration": 4.790251731872559 + }, + { + "auxiliary_loss_clip": 0.01147486, + "auxiliary_loss_mlp": 0.0108923, + "balance_loss_clip": 1.04030442, + "balance_loss_mlp": 1.00716603, + "epoch": 0.21042505861840918, + "flos": 17457290835840.0, + "grad_norm": 1.9894824772953679, + "language_loss": 0.78697139, + "learning_rate": 3.668275459339588e-06, + "loss": 0.80933857, + "num_input_tokens_seen": 37152185, + "step": 1750, + "time_per_iteration": 2.6986560821533203 + }, + { + "auxiliary_loss_clip": 0.01164914, + "auxiliary_loss_mlp": 0.01088116, + "balance_loss_clip": 1.04133224, + "balance_loss_mlp": 1.00619531, + "epoch": 0.21054530150904827, + "flos": 14209817195520.0, + "grad_norm": 2.0885118599600068, + "language_loss": 0.79959422, + "learning_rate": 3.667845686166358e-06, + "loss": 0.82212448, + "num_input_tokens_seen": 37169110, + "step": 1751, + "time_per_iteration": 2.680924892425537 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01091814, + "balance_loss_clip": 1.03974938, + "balance_loss_mlp": 1.00960779, + "epoch": 0.21066554439968738, + "flos": 18618210403200.0, + "grad_norm": 1.6559596221962454, + "language_loss": 0.85901016, + "learning_rate": 3.6674156599822634e-06, + "loss": 0.88125086, + "num_input_tokens_seen": 37184905, + "step": 1752, + "time_per_iteration": 2.7622082233428955 + }, + { + "auxiliary_loss_clip": 0.01126899, + "auxiliary_loss_mlp": 0.01091237, + "balance_loss_clip": 1.03720808, + "balance_loss_mlp": 1.0089345, + "epoch": 0.21078578729032646, + "flos": 23658883741440.0, + "grad_norm": 1.6707119104035721, + "language_loss": 0.82002467, + "learning_rate": 3.666985380852539e-06, + "loss": 0.84220606, + "num_input_tokens_seen": 37203910, + "step": 1753, + "time_per_iteration": 2.817540168762207 + }, + { + "auxiliary_loss_clip": 0.01141136, + "auxiliary_loss_mlp": 0.01089295, + "balance_loss_clip": 1.03665066, + "balance_loss_mlp": 1.00708795, + "epoch": 0.21090603018096554, + "flos": 29346379240320.0, + "grad_norm": 3.489276459622572, + "language_loss": 0.74382037, + "learning_rate": 3.6665548488424576e-06, + "loss": 0.76612473, + "num_input_tokens_seen": 37222670, + "step": 1754, + "time_per_iteration": 2.8454508781433105 + }, + { + "auxiliary_loss_clip": 0.01165162, + "auxiliary_loss_mlp": 0.01089365, + "balance_loss_clip": 1.04163337, + "balance_loss_mlp": 1.00734925, + "epoch": 0.21102627307160465, + "flos": 23261245205760.0, + "grad_norm": 1.617459582128683, + "language_loss": 0.87991267, + "learning_rate": 3.6661240640173307e-06, + "loss": 0.90245795, + "num_input_tokens_seen": 37244140, + "step": 1755, + "time_per_iteration": 2.677827835083008 + }, + { + "auxiliary_loss_clip": 0.01132099, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_clip": 1.05181694, + "balance_loss_mlp": 1.00095773, + "epoch": 0.21114651596224374, + "flos": 54633454577280.0, + "grad_norm": 0.8715673778332197, + "language_loss": 0.57952178, + "learning_rate": 3.6656930264425085e-06, + "loss": 0.60164964, + "num_input_tokens_seen": 37308185, + "step": 1756, + "time_per_iteration": 3.403353452682495 + }, + { + "auxiliary_loss_clip": 0.01166017, + "auxiliary_loss_mlp": 0.01087867, + "balance_loss_clip": 1.04238951, + "balance_loss_mlp": 1.00589895, + "epoch": 0.21126675885288282, + "flos": 21543314457600.0, + "grad_norm": 1.87667223192399, + "language_loss": 0.75765836, + "learning_rate": 3.665261736183378e-06, + "loss": 0.78019714, + "num_input_tokens_seen": 37328220, + "step": 1757, + "time_per_iteration": 2.6920292377471924 + }, + { + "auxiliary_loss_clip": 0.01128224, + "auxiliary_loss_mlp": 0.01089486, + "balance_loss_clip": 1.03371716, + "balance_loss_mlp": 1.00732684, + "epoch": 0.2113870017435219, + "flos": 10961876678400.0, + "grad_norm": 2.1564516395422495, + "language_loss": 0.88651061, + "learning_rate": 3.664830193305366e-06, + "loss": 0.90868771, + "num_input_tokens_seen": 37345995, + "step": 1758, + "time_per_iteration": 2.9195032119750977 + }, + { + "auxiliary_loss_clip": 0.01136767, + "auxiliary_loss_mlp": 0.01088538, + "balance_loss_clip": 1.03805673, + "balance_loss_mlp": 1.0065695, + "epoch": 0.211507244634161, + "flos": 16653825463680.0, + "grad_norm": 11.830206075058364, + "language_loss": 0.77196223, + "learning_rate": 3.6643983978739373e-06, + "loss": 0.79421526, + "num_input_tokens_seen": 37362610, + "step": 1759, + "time_per_iteration": 2.830826997756958 + }, + { + "auxiliary_loss_clip": 0.01148306, + "auxiliary_loss_mlp": 0.0109101, + "balance_loss_clip": 1.04261148, + "balance_loss_mlp": 1.00899386, + "epoch": 0.2116274875248001, + "flos": 20954091755520.0, + "grad_norm": 1.7383558947842048, + "language_loss": 0.82062644, + "learning_rate": 3.663966349954596e-06, + "loss": 0.84301955, + "num_input_tokens_seen": 37382790, + "step": 1760, + "time_per_iteration": 2.8259544372558594 + }, + { + "auxiliary_loss_clip": 0.01156599, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_clip": 1.05364561, + "balance_loss_mlp": 1.00037134, + "epoch": 0.21174773041543918, + "flos": 68196949424640.0, + "grad_norm": 0.7857325068360611, + "language_loss": 0.59672362, + "learning_rate": 3.6635340496128816e-06, + "loss": 0.61909056, + "num_input_tokens_seen": 37439720, + "step": 1761, + "time_per_iteration": 3.2121927738189697 + }, + { + "auxiliary_loss_clip": 0.01128711, + "auxiliary_loss_mlp": 0.01090182, + "balance_loss_clip": 1.03864181, + "balance_loss_mlp": 1.00816607, + "epoch": 0.2118679733060783, + "flos": 20668315150080.0, + "grad_norm": 1.8969992962158813, + "language_loss": 0.93058044, + "learning_rate": 3.6631014969143747e-06, + "loss": 0.9527694, + "num_input_tokens_seen": 37459410, + "step": 1762, + "time_per_iteration": 2.8089754581451416 + }, + { + "auxiliary_loss_clip": 0.01153048, + "auxiliary_loss_mlp": 0.0109099, + "balance_loss_clip": 1.03924549, + "balance_loss_mlp": 1.00897408, + "epoch": 0.21198821619671737, + "flos": 23223431162880.0, + "grad_norm": 1.7026521975909146, + "language_loss": 0.88596177, + "learning_rate": 3.662668691924693e-06, + "loss": 0.9084022, + "num_input_tokens_seen": 37480460, + "step": 1763, + "time_per_iteration": 2.7779769897460938 + }, + { + "auxiliary_loss_clip": 0.01137846, + "auxiliary_loss_mlp": 0.01088385, + "balance_loss_clip": 1.03873253, + "balance_loss_mlp": 1.00632155, + "epoch": 0.21210845908735645, + "flos": 24498547044480.0, + "grad_norm": 3.301644832283485, + "language_loss": 0.70630813, + "learning_rate": 3.6622356347094927e-06, + "loss": 0.72857046, + "num_input_tokens_seen": 37502025, + "step": 1764, + "time_per_iteration": 2.8695013523101807 + }, + { + "auxiliary_loss_clip": 0.01136272, + "auxiliary_loss_mlp": 0.01089717, + "balance_loss_clip": 1.04124415, + "balance_loss_mlp": 1.00741541, + "epoch": 0.21222870197799554, + "flos": 27089789160960.0, + "grad_norm": 1.9273390887075883, + "language_loss": 0.78801286, + "learning_rate": 3.6618023253344684e-06, + "loss": 0.81027281, + "num_input_tokens_seen": 37520885, + "step": 1765, + "time_per_iteration": 2.8429150581359863 + }, + { + "auxiliary_loss_clip": 0.01157053, + "auxiliary_loss_mlp": 0.0108963, + "balance_loss_clip": 1.04116702, + "balance_loss_mlp": 1.00732827, + "epoch": 0.21234894486863465, + "flos": 16873850223360.0, + "grad_norm": 1.5487728633045907, + "language_loss": 0.83757889, + "learning_rate": 3.6613687638653527e-06, + "loss": 0.86004579, + "num_input_tokens_seen": 37539055, + "step": 1766, + "time_per_iteration": 3.681708335876465 + }, + { + "auxiliary_loss_clip": 0.0113972, + "auxiliary_loss_mlp": 0.01089579, + "balance_loss_clip": 1.03786957, + "balance_loss_mlp": 1.00765848, + "epoch": 0.21246918775927373, + "flos": 23474949171840.0, + "grad_norm": 1.6226657392152446, + "language_loss": 0.77753103, + "learning_rate": 3.660934950367916e-06, + "loss": 0.799824, + "num_input_tokens_seen": 37558300, + "step": 1767, + "time_per_iteration": 2.7631804943084717 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01089005, + "balance_loss_clip": 1.03930652, + "balance_loss_mlp": 1.00698924, + "epoch": 0.21258943064991281, + "flos": 22382295402240.0, + "grad_norm": 1.6275227318618202, + "language_loss": 0.8336094, + "learning_rate": 3.660500884907968e-06, + "loss": 0.85604465, + "num_input_tokens_seen": 37579040, + "step": 1768, + "time_per_iteration": 2.9020652770996094 + }, + { + "auxiliary_loss_clip": 0.01111763, + "auxiliary_loss_mlp": 0.01079871, + "balance_loss_clip": 1.03424656, + "balance_loss_mlp": 1.00014389, + "epoch": 0.21270967354055192, + "flos": 59440168679040.0, + "grad_norm": 0.8232385593726504, + "language_loss": 0.60027975, + "learning_rate": 3.660066567551356e-06, + "loss": 0.62219608, + "num_input_tokens_seen": 37639185, + "step": 1769, + "time_per_iteration": 3.3407013416290283 + }, + { + "auxiliary_loss_clip": 0.01155082, + "auxiliary_loss_mlp": 0.00874106, + "balance_loss_clip": 1.04018414, + "balance_loss_mlp": 1.00002611, + "epoch": 0.212829916431191, + "flos": 21544032729600.0, + "grad_norm": 2.1857300932024, + "language_loss": 0.84275132, + "learning_rate": 3.6596319983639657e-06, + "loss": 0.86304319, + "num_input_tokens_seen": 37657765, + "step": 1770, + "time_per_iteration": 2.775383949279785 + }, + { + "auxiliary_loss_clip": 0.01130103, + "auxiliary_loss_mlp": 0.0087427, + "balance_loss_clip": 1.03657138, + "balance_loss_mlp": 0.9999814, + "epoch": 0.2129501593218301, + "flos": 28987739896320.0, + "grad_norm": 1.6125291166333373, + "language_loss": 0.86284161, + "learning_rate": 3.6591971774117214e-06, + "loss": 0.88288534, + "num_input_tokens_seen": 37680740, + "step": 1771, + "time_per_iteration": 3.8459055423736572 + }, + { + "auxiliary_loss_clip": 0.01155494, + "auxiliary_loss_mlp": 0.01090473, + "balance_loss_clip": 1.03953981, + "balance_loss_mlp": 1.00831378, + "epoch": 0.2130704022124692, + "flos": 18806993308800.0, + "grad_norm": 2.0699242168183964, + "language_loss": 0.8050431, + "learning_rate": 3.6587621047605833e-06, + "loss": 0.82750273, + "num_input_tokens_seen": 37697910, + "step": 1772, + "time_per_iteration": 2.7109928131103516 + }, + { + "auxiliary_loss_clip": 0.01153579, + "auxiliary_loss_mlp": 0.01090028, + "balance_loss_clip": 1.03916407, + "balance_loss_mlp": 1.0080595, + "epoch": 0.21319064510310828, + "flos": 13918150759680.0, + "grad_norm": 1.9724466510589778, + "language_loss": 0.86994481, + "learning_rate": 3.6583267804765542e-06, + "loss": 0.89238095, + "num_input_tokens_seen": 37712245, + "step": 1773, + "time_per_iteration": 2.8473451137542725 + }, + { + "auxiliary_loss_clip": 0.01147779, + "auxiliary_loss_mlp": 0.01088605, + "balance_loss_clip": 1.03947091, + "balance_loss_mlp": 1.00644612, + "epoch": 0.21331088799374737, + "flos": 20959694277120.0, + "grad_norm": 1.955616892203867, + "language_loss": 0.85704738, + "learning_rate": 3.6578912046256702e-06, + "loss": 0.87941122, + "num_input_tokens_seen": 37730765, + "step": 1774, + "time_per_iteration": 3.6798155307769775 + }, + { + "auxiliary_loss_clip": 0.01136732, + "auxiliary_loss_mlp": 0.01089242, + "balance_loss_clip": 1.03861392, + "balance_loss_mlp": 1.00713038, + "epoch": 0.21343113088438645, + "flos": 18624638937600.0, + "grad_norm": 1.924916064089457, + "language_loss": 0.76260287, + "learning_rate": 3.6574553772740083e-06, + "loss": 0.78486264, + "num_input_tokens_seen": 37748695, + "step": 1775, + "time_per_iteration": 3.699734687805176 + }, + { + "auxiliary_loss_clip": 0.01142485, + "auxiliary_loss_mlp": 0.01079641, + "balance_loss_clip": 1.04140449, + "balance_loss_mlp": 0.99991399, + "epoch": 0.21355137377502556, + "flos": 67413128791680.0, + "grad_norm": 0.9261355545946819, + "language_loss": 0.61874056, + "learning_rate": 3.657019298487684e-06, + "loss": 0.64096189, + "num_input_tokens_seen": 37813705, + "step": 1776, + "time_per_iteration": 3.2674639225006104 + }, + { + "auxiliary_loss_clip": 0.011507, + "auxiliary_loss_mlp": 0.00874287, + "balance_loss_clip": 1.04003859, + "balance_loss_mlp": 1.0000577, + "epoch": 0.21367161666566464, + "flos": 34532095697280.0, + "grad_norm": 1.8741090036956307, + "language_loss": 0.83587229, + "learning_rate": 3.6565829683328495e-06, + "loss": 0.85612214, + "num_input_tokens_seen": 37836330, + "step": 1777, + "time_per_iteration": 2.795907497406006 + }, + { + "auxiliary_loss_clip": 0.01145826, + "auxiliary_loss_mlp": 0.01087608, + "balance_loss_clip": 1.03362644, + "balance_loss_mlp": 1.00568783, + "epoch": 0.21379185955630373, + "flos": 18989347680000.0, + "grad_norm": 1.980239583572467, + "language_loss": 0.85980129, + "learning_rate": 3.6561463868756965e-06, + "loss": 0.88213563, + "num_input_tokens_seen": 37855030, + "step": 1778, + "time_per_iteration": 2.653433084487915 + }, + { + "auxiliary_loss_clip": 0.01155181, + "auxiliary_loss_mlp": 0.01089203, + "balance_loss_clip": 1.04133165, + "balance_loss_mlp": 1.00709188, + "epoch": 0.21391210244694284, + "flos": 28218497207040.0, + "grad_norm": 1.5384617171949415, + "language_loss": 0.78016615, + "learning_rate": 3.655709554182452e-06, + "loss": 0.80260998, + "num_input_tokens_seen": 37875370, + "step": 1779, + "time_per_iteration": 2.810157060623169 + }, + { + "auxiliary_loss_clip": 0.01156556, + "auxiliary_loss_mlp": 0.0109006, + "balance_loss_clip": 1.04154563, + "balance_loss_mlp": 1.0081389, + "epoch": 0.21403234533758192, + "flos": 17455064192640.0, + "grad_norm": 1.783195046311138, + "language_loss": 0.84515202, + "learning_rate": 3.6552724703193855e-06, + "loss": 0.86761808, + "num_input_tokens_seen": 37892560, + "step": 1780, + "time_per_iteration": 2.6293792724609375 + }, + { + "auxiliary_loss_clip": 0.01106034, + "auxiliary_loss_mlp": 0.01081335, + "balance_loss_clip": 1.03723264, + "balance_loss_mlp": 1.00160789, + "epoch": 0.214152588228221, + "flos": 51637606686720.0, + "grad_norm": 0.790581375121772, + "language_loss": 0.55945504, + "learning_rate": 3.654835135352801e-06, + "loss": 0.58132875, + "num_input_tokens_seen": 37947370, + "step": 1781, + "time_per_iteration": 3.2579777240753174 + }, + { + "auxiliary_loss_clip": 0.0112733, + "auxiliary_loss_mlp": 0.01087704, + "balance_loss_clip": 1.03615439, + "balance_loss_mlp": 1.00573611, + "epoch": 0.21427283111886009, + "flos": 19496154625920.0, + "grad_norm": 1.8884917045855465, + "language_loss": 0.87445366, + "learning_rate": 3.654397549349043e-06, + "loss": 0.896604, + "num_input_tokens_seen": 37964745, + "step": 1782, + "time_per_iteration": 2.7685623168945312 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.01088394, + "balance_loss_clip": 1.0369941, + "balance_loss_mlp": 1.00637794, + "epoch": 0.2143930740094992, + "flos": 20084802710400.0, + "grad_norm": 2.1939060878928096, + "language_loss": 0.75870216, + "learning_rate": 3.653959712374491e-06, + "loss": 0.78097105, + "num_input_tokens_seen": 37982850, + "step": 1783, + "time_per_iteration": 2.7437844276428223 + }, + { + "auxiliary_loss_clip": 0.011201, + "auxiliary_loss_mlp": 0.01088396, + "balance_loss_clip": 1.03067136, + "balance_loss_mlp": 1.00642788, + "epoch": 0.21451331690013828, + "flos": 21798603394560.0, + "grad_norm": 1.5112469796668826, + "language_loss": 0.82856107, + "learning_rate": 3.6535216244955663e-06, + "loss": 0.85064602, + "num_input_tokens_seen": 38002745, + "step": 1784, + "time_per_iteration": 2.8036081790924072 + }, + { + "auxiliary_loss_clip": 0.01141187, + "auxiliary_loss_mlp": 0.01089816, + "balance_loss_clip": 1.03577709, + "balance_loss_mlp": 1.00784731, + "epoch": 0.21463355979077736, + "flos": 32853882412800.0, + "grad_norm": 1.8900873130244373, + "language_loss": 0.71093619, + "learning_rate": 3.653083285778726e-06, + "loss": 0.73324621, + "num_input_tokens_seen": 38024115, + "step": 1785, + "time_per_iteration": 2.913827657699585 + }, + { + "auxiliary_loss_clip": 0.0115284, + "auxiliary_loss_mlp": 0.01088633, + "balance_loss_clip": 1.03750169, + "balance_loss_mlp": 1.00661719, + "epoch": 0.21475380268141647, + "flos": 21543817248000.0, + "grad_norm": 2.243455945159979, + "language_loss": 0.81460071, + "learning_rate": 3.6526446962904653e-06, + "loss": 0.83701545, + "num_input_tokens_seen": 38042830, + "step": 1786, + "time_per_iteration": 2.709930419921875 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01090456, + "balance_loss_clip": 1.03764701, + "balance_loss_mlp": 1.00844026, + "epoch": 0.21487404557205556, + "flos": 32159082660480.0, + "grad_norm": 1.5193493850115392, + "language_loss": 0.74327826, + "learning_rate": 3.652205856097318e-06, + "loss": 0.76563931, + "num_input_tokens_seen": 38066015, + "step": 1787, + "time_per_iteration": 2.7637243270874023 + }, + { + "auxiliary_loss_clip": 0.01134738, + "auxiliary_loss_mlp": 0.00874203, + "balance_loss_clip": 1.03586614, + "balance_loss_mlp": 1.00010586, + "epoch": 0.21499428846269464, + "flos": 12673091583360.0, + "grad_norm": 1.954696092685629, + "language_loss": 0.79010588, + "learning_rate": 3.651766765265856e-06, + "loss": 0.81019521, + "num_input_tokens_seen": 38083025, + "step": 1788, + "time_per_iteration": 2.7863268852233887 + }, + { + "auxiliary_loss_clip": 0.01139736, + "auxiliary_loss_mlp": 0.01088107, + "balance_loss_clip": 1.03424382, + "balance_loss_mlp": 1.00609064, + "epoch": 0.21511453135333372, + "flos": 23471573293440.0, + "grad_norm": 2.0423645937529256, + "language_loss": 0.80980903, + "learning_rate": 3.65132742386269e-06, + "loss": 0.8320874, + "num_input_tokens_seen": 38098245, + "step": 1789, + "time_per_iteration": 2.7095844745635986 + }, + { + "auxiliary_loss_clip": 0.01163558, + "auxiliary_loss_mlp": 0.0109052, + "balance_loss_clip": 1.04018378, + "balance_loss_mlp": 1.00840831, + "epoch": 0.21523477424397283, + "flos": 26943560893440.0, + "grad_norm": 1.9534950964980964, + "language_loss": 0.84914362, + "learning_rate": 3.6508878319544656e-06, + "loss": 0.87168437, + "num_input_tokens_seen": 38118460, + "step": 1790, + "time_per_iteration": 2.8514044284820557 + }, + { + "auxiliary_loss_clip": 0.01143245, + "auxiliary_loss_mlp": 0.01087448, + "balance_loss_clip": 1.03741169, + "balance_loss_mlp": 1.00552738, + "epoch": 0.21535501713461191, + "flos": 18916161719040.0, + "grad_norm": 2.5120595819381375, + "language_loss": 0.81886917, + "learning_rate": 3.65044798960787e-06, + "loss": 0.84117609, + "num_input_tokens_seen": 38136800, + "step": 1791, + "time_per_iteration": 2.672698736190796 + }, + { + "auxiliary_loss_clip": 0.01129817, + "auxiliary_loss_mlp": 0.01090752, + "balance_loss_clip": 1.03406143, + "balance_loss_mlp": 1.00883162, + "epoch": 0.215475260025251, + "flos": 17895113712000.0, + "grad_norm": 1.7564243170674196, + "language_loss": 0.78272307, + "learning_rate": 3.650007896889627e-06, + "loss": 0.80492878, + "num_input_tokens_seen": 38155380, + "step": 1792, + "time_per_iteration": 2.804506778717041 + }, + { + "auxiliary_loss_clip": 0.01163768, + "auxiliary_loss_mlp": 0.01089614, + "balance_loss_clip": 1.04082143, + "balance_loss_mlp": 1.00764561, + "epoch": 0.2155955029158901, + "flos": 16654292340480.0, + "grad_norm": 1.712355280122188, + "language_loss": 0.80325496, + "learning_rate": 3.6495675538664974e-06, + "loss": 0.82578886, + "num_input_tokens_seen": 38174395, + "step": 1793, + "time_per_iteration": 3.5343177318573 + }, + { + "auxiliary_loss_clip": 0.01145762, + "auxiliary_loss_mlp": 0.01088588, + "balance_loss_clip": 1.03908908, + "balance_loss_mlp": 1.00671458, + "epoch": 0.2157157458065292, + "flos": 23621213352960.0, + "grad_norm": 1.796465283244919, + "language_loss": 0.82622814, + "learning_rate": 3.649126960605282e-06, + "loss": 0.84857166, + "num_input_tokens_seen": 38195380, + "step": 1794, + "time_per_iteration": 2.700434446334839 + }, + { + "auxiliary_loss_clip": 0.01132971, + "auxiliary_loss_mlp": 0.01089873, + "balance_loss_clip": 1.0303309, + "balance_loss_mlp": 1.00790501, + "epoch": 0.21583598869716827, + "flos": 22127078292480.0, + "grad_norm": 2.202517167324758, + "language_loss": 0.83838922, + "learning_rate": 3.6486861171728174e-06, + "loss": 0.86061764, + "num_input_tokens_seen": 38213775, + "step": 1795, + "time_per_iteration": 2.7794649600982666 + }, + { + "auxiliary_loss_clip": 0.01135866, + "auxiliary_loss_mlp": 0.01089507, + "balance_loss_clip": 1.03679502, + "balance_loss_mlp": 1.00749087, + "epoch": 0.21595623158780738, + "flos": 23441229279360.0, + "grad_norm": 2.943221136636496, + "language_loss": 0.78765917, + "learning_rate": 3.6482450236359803e-06, + "loss": 0.80991292, + "num_input_tokens_seen": 38235630, + "step": 1796, + "time_per_iteration": 3.8429877758026123 + }, + { + "auxiliary_loss_clip": 0.01153467, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_clip": 1.03938723, + "balance_loss_mlp": 1.00676274, + "epoch": 0.21607647447844647, + "flos": 26906501036160.0, + "grad_norm": 2.046401457637248, + "language_loss": 0.7756083, + "learning_rate": 3.647803680061683e-06, + "loss": 0.79802835, + "num_input_tokens_seen": 38256045, + "step": 1797, + "time_per_iteration": 2.704967975616455 + }, + { + "auxiliary_loss_clip": 0.01142432, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.03726697, + "balance_loss_mlp": 1.00640249, + "epoch": 0.21619671736908555, + "flos": 14495378319360.0, + "grad_norm": 2.3509933236327174, + "language_loss": 0.75053942, + "learning_rate": 3.6473620865168776e-06, + "loss": 0.77284884, + "num_input_tokens_seen": 38272915, + "step": 1798, + "time_per_iteration": 2.79801082611084 + }, + { + "auxiliary_loss_clip": 0.01142324, + "auxiliary_loss_mlp": 0.01089197, + "balance_loss_clip": 1.03814578, + "balance_loss_mlp": 1.0072763, + "epoch": 0.21631696025972463, + "flos": 17931096161280.0, + "grad_norm": 1.9146020606031815, + "language_loss": 0.81925714, + "learning_rate": 3.646920243068554e-06, + "loss": 0.84157234, + "num_input_tokens_seen": 38290810, + "step": 1799, + "time_per_iteration": 2.744462251663208 + }, + { + "auxiliary_loss_clip": 0.01135251, + "auxiliary_loss_mlp": 0.01087952, + "balance_loss_clip": 1.03557694, + "balance_loss_mlp": 1.00603139, + "epoch": 0.21643720315036374, + "flos": 24462385027200.0, + "grad_norm": 1.6615088439190855, + "language_loss": 0.74681365, + "learning_rate": 3.6464781497837384e-06, + "loss": 0.76904571, + "num_input_tokens_seen": 38312785, + "step": 1800, + "time_per_iteration": 4.837147951126099 + }, + { + "auxiliary_loss_clip": 0.01144382, + "auxiliary_loss_mlp": 0.01088412, + "balance_loss_clip": 1.03707194, + "balance_loss_mlp": 1.00634837, + "epoch": 0.21655744604100283, + "flos": 28474432588800.0, + "grad_norm": 1.8048302834275063, + "language_loss": 0.72921842, + "learning_rate": 3.6460358067294965e-06, + "loss": 0.75154638, + "num_input_tokens_seen": 38334015, + "step": 1801, + "time_per_iteration": 2.923213005065918 + }, + { + "auxiliary_loss_clip": 0.01162283, + "auxiliary_loss_mlp": 0.01089183, + "balance_loss_clip": 1.0383451, + "balance_loss_mlp": 1.00692904, + "epoch": 0.2166776889316419, + "flos": 20152960767360.0, + "grad_norm": 3.3380002634645916, + "language_loss": 0.77622634, + "learning_rate": 3.645593213972932e-06, + "loss": 0.79874098, + "num_input_tokens_seen": 38352920, + "step": 1802, + "time_per_iteration": 2.7868082523345947 + }, + { + "auxiliary_loss_clip": 0.0115492, + "auxiliary_loss_mlp": 0.01091635, + "balance_loss_clip": 1.03940272, + "balance_loss_mlp": 1.00957119, + "epoch": 0.21679793182228102, + "flos": 15193482122880.0, + "grad_norm": 1.956994563717673, + "language_loss": 0.79769892, + "learning_rate": 3.6451503715811852e-06, + "loss": 0.8201645, + "num_input_tokens_seen": 38371230, + "step": 1803, + "time_per_iteration": 2.6928322315216064 + }, + { + "auxiliary_loss_clip": 0.0114155, + "auxiliary_loss_mlp": 0.01089577, + "balance_loss_clip": 1.03743577, + "balance_loss_mlp": 1.00789475, + "epoch": 0.2169181747129201, + "flos": 17384464010880.0, + "grad_norm": 2.117311600054973, + "language_loss": 0.80149305, + "learning_rate": 3.6447072796214345e-06, + "loss": 0.82380432, + "num_input_tokens_seen": 38389795, + "step": 1804, + "time_per_iteration": 2.7324841022491455 + }, + { + "auxiliary_loss_clip": 0.01125541, + "auxiliary_loss_mlp": 0.01080925, + "balance_loss_clip": 1.04733026, + "balance_loss_mlp": 1.0011977, + "epoch": 0.21703841760355919, + "flos": 58760955429120.0, + "grad_norm": 0.9221177087618546, + "language_loss": 0.63198698, + "learning_rate": 3.644263938160898e-06, + "loss": 0.6540516, + "num_input_tokens_seen": 38445760, + "step": 1805, + "time_per_iteration": 3.2782018184661865 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.010893, + "balance_loss_clip": 1.03699172, + "balance_loss_mlp": 1.00728416, + "epoch": 0.21715866049419827, + "flos": 22418457419520.0, + "grad_norm": 1.6731170588929873, + "language_loss": 0.72098726, + "learning_rate": 3.6438203472668293e-06, + "loss": 0.74323523, + "num_input_tokens_seen": 38465405, + "step": 1806, + "time_per_iteration": 2.7976291179656982 + }, + { + "auxiliary_loss_clip": 0.01143594, + "auxiliary_loss_mlp": 0.01088777, + "balance_loss_clip": 1.03808975, + "balance_loss_mlp": 1.00704753, + "epoch": 0.21727890338483738, + "flos": 17237732952960.0, + "grad_norm": 2.0097326835350895, + "language_loss": 0.81721765, + "learning_rate": 3.6433765070065206e-06, + "loss": 0.83954138, + "num_input_tokens_seen": 38483195, + "step": 1807, + "time_per_iteration": 2.719951868057251 + }, + { + "auxiliary_loss_clip": 0.01162047, + "auxiliary_loss_mlp": 0.01089691, + "balance_loss_clip": 1.03881657, + "balance_loss_mlp": 1.00762749, + "epoch": 0.21739914627547646, + "flos": 13434792416640.0, + "grad_norm": 2.557383938554813, + "language_loss": 0.87463295, + "learning_rate": 3.6429324174473025e-06, + "loss": 0.89715028, + "num_input_tokens_seen": 38496735, + "step": 1808, + "time_per_iteration": 2.6355221271514893 + }, + { + "auxiliary_loss_clip": 0.01153474, + "auxiliary_loss_mlp": 0.01089393, + "balance_loss_clip": 1.03817642, + "balance_loss_mlp": 1.007568, + "epoch": 0.21751938916611555, + "flos": 20959514709120.0, + "grad_norm": 2.017390876264815, + "language_loss": 0.85195065, + "learning_rate": 3.6424880786565425e-06, + "loss": 0.87437934, + "num_input_tokens_seen": 38512880, + "step": 1809, + "time_per_iteration": 2.716447591781616 + }, + { + "auxiliary_loss_clip": 0.01111927, + "auxiliary_loss_mlp": 0.01090935, + "balance_loss_clip": 1.03209233, + "balance_loss_mlp": 1.00872803, + "epoch": 0.21763963205675466, + "flos": 27599936071680.0, + "grad_norm": 1.9711947031278871, + "language_loss": 0.80350852, + "learning_rate": 3.6420434907016482e-06, + "loss": 0.82553715, + "num_input_tokens_seen": 38532570, + "step": 1810, + "time_per_iteration": 2.930274486541748 + }, + { + "auxiliary_loss_clip": 0.01153127, + "auxiliary_loss_mlp": 0.0109066, + "balance_loss_clip": 1.03891444, + "balance_loss_mlp": 1.00878668, + "epoch": 0.21775987494739374, + "flos": 21430411032960.0, + "grad_norm": 1.5331121268179262, + "language_loss": 0.81046236, + "learning_rate": 3.6415986536500606e-06, + "loss": 0.83290017, + "num_input_tokens_seen": 38550900, + "step": 1811, + "time_per_iteration": 2.6978354454040527 + }, + { + "auxiliary_loss_clip": 0.01117243, + "auxiliary_loss_mlp": 0.01090658, + "balance_loss_clip": 1.03365922, + "balance_loss_mlp": 1.00892782, + "epoch": 0.21788011783803282, + "flos": 18332972501760.0, + "grad_norm": 1.5885666855204816, + "language_loss": 0.80904543, + "learning_rate": 3.641153567569263e-06, + "loss": 0.83112442, + "num_input_tokens_seen": 38569215, + "step": 1812, + "time_per_iteration": 2.7740821838378906 + }, + { + "auxiliary_loss_clip": 0.01150643, + "auxiliary_loss_mlp": 0.01087292, + "balance_loss_clip": 1.03762126, + "balance_loss_mlp": 1.00537157, + "epoch": 0.2180003607286719, + "flos": 30262748037120.0, + "grad_norm": 2.1017839145311408, + "language_loss": 0.95399743, + "learning_rate": 3.640708232526774e-06, + "loss": 0.97637677, + "num_input_tokens_seen": 38587870, + "step": 1813, + "time_per_iteration": 2.7335846424102783 + }, + { + "auxiliary_loss_clip": 0.01112737, + "auxiliary_loss_mlp": 0.01088296, + "balance_loss_clip": 1.03352475, + "balance_loss_mlp": 1.00642312, + "epoch": 0.21812060361931102, + "flos": 25480272637440.0, + "grad_norm": 1.6242855341537878, + "language_loss": 0.78536975, + "learning_rate": 3.6402626485901504e-06, + "loss": 0.80738008, + "num_input_tokens_seen": 38606965, + "step": 1814, + "time_per_iteration": 2.9159934520721436 + }, + { + "auxiliary_loss_clip": 0.01152321, + "auxiliary_loss_mlp": 0.01088237, + "balance_loss_clip": 1.03898573, + "balance_loss_mlp": 1.00660229, + "epoch": 0.2182408465099501, + "flos": 21908166854400.0, + "grad_norm": 1.9171528659505983, + "language_loss": 0.78287697, + "learning_rate": 3.639816815826988e-06, + "loss": 0.80528259, + "num_input_tokens_seen": 38626290, + "step": 1815, + "time_per_iteration": 2.694606065750122 + }, + { + "auxiliary_loss_clip": 0.01141423, + "auxiliary_loss_mlp": 0.01089597, + "balance_loss_clip": 1.03619444, + "balance_loss_mlp": 1.00781918, + "epoch": 0.21836108940058918, + "flos": 23657339456640.0, + "grad_norm": 1.823663555707012, + "language_loss": 0.78037745, + "learning_rate": 3.6393707343049176e-06, + "loss": 0.80268764, + "num_input_tokens_seen": 38646620, + "step": 1816, + "time_per_iteration": 2.67978572845459 + }, + { + "auxiliary_loss_clip": 0.01153264, + "auxiliary_loss_mlp": 0.01087416, + "balance_loss_clip": 1.03852022, + "balance_loss_mlp": 1.0055908, + "epoch": 0.2184813322912283, + "flos": 24681009156480.0, + "grad_norm": 3.0510296792852896, + "language_loss": 0.73343718, + "learning_rate": 3.6389244040916104e-06, + "loss": 0.755844, + "num_input_tokens_seen": 38665695, + "step": 1817, + "time_per_iteration": 2.703657865524292 + }, + { + "auxiliary_loss_clip": 0.01141855, + "auxiliary_loss_mlp": 0.00874204, + "balance_loss_clip": 1.03646588, + "balance_loss_mlp": 1.0001229, + "epoch": 0.21860157518186737, + "flos": 26574650259840.0, + "grad_norm": 2.188574836921483, + "language_loss": 0.79647446, + "learning_rate": 3.6384778252547747e-06, + "loss": 0.81663507, + "num_input_tokens_seen": 38681575, + "step": 1818, + "time_per_iteration": 3.6691112518310547 + }, + { + "auxiliary_loss_clip": 0.01140585, + "auxiliary_loss_mlp": 0.00874207, + "balance_loss_clip": 1.0369885, + "balance_loss_mlp": 1.00013804, + "epoch": 0.21872181807250646, + "flos": 20886292834560.0, + "grad_norm": 2.289260434918263, + "language_loss": 0.77668548, + "learning_rate": 3.638030997862155e-06, + "loss": 0.79683346, + "num_input_tokens_seen": 38700510, + "step": 1819, + "time_per_iteration": 2.676962375640869 + }, + { + "auxiliary_loss_clip": 0.01145721, + "auxiliary_loss_mlp": 0.01080523, + "balance_loss_clip": 1.05269742, + "balance_loss_mlp": 1.0007962, + "epoch": 0.21884206096314554, + "flos": 61209452897280.0, + "grad_norm": 0.7565373231788469, + "language_loss": 0.59458899, + "learning_rate": 3.6375839219815356e-06, + "loss": 0.61685139, + "num_input_tokens_seen": 38758310, + "step": 1820, + "time_per_iteration": 3.2459027767181396 + }, + { + "auxiliary_loss_clip": 0.01163393, + "auxiliary_loss_mlp": 0.0108974, + "balance_loss_clip": 1.04025841, + "balance_loss_mlp": 1.00762856, + "epoch": 0.21896230385378465, + "flos": 23473835850240.0, + "grad_norm": 2.0678443519979686, + "language_loss": 0.82763422, + "learning_rate": 3.6371365976807375e-06, + "loss": 0.85016555, + "num_input_tokens_seen": 38778705, + "step": 1821, + "time_per_iteration": 3.6183924674987793 + }, + { + "auxiliary_loss_clip": 0.01122048, + "auxiliary_loss_mlp": 0.0109079, + "balance_loss_clip": 1.03456116, + "balance_loss_mlp": 1.00867879, + "epoch": 0.21908254674442373, + "flos": 25081915829760.0, + "grad_norm": 2.3400405476226083, + "language_loss": 0.83309126, + "learning_rate": 3.6366890250276185e-06, + "loss": 0.85521966, + "num_input_tokens_seen": 38799660, + "step": 1822, + "time_per_iteration": 2.9028656482696533 + }, + { + "auxiliary_loss_clip": 0.01160445, + "auxiliary_loss_mlp": 0.01087844, + "balance_loss_clip": 1.03746557, + "balance_loss_mlp": 1.00597143, + "epoch": 0.21920278963506282, + "flos": 23513768795520.0, + "grad_norm": 1.9348664315353339, + "language_loss": 0.8946197, + "learning_rate": 3.6362412040900764e-06, + "loss": 0.91710263, + "num_input_tokens_seen": 38819450, + "step": 1823, + "time_per_iteration": 2.7950072288513184 + }, + { + "auxiliary_loss_clip": 0.01153487, + "auxiliary_loss_mlp": 0.01088466, + "balance_loss_clip": 1.03864741, + "balance_loss_mlp": 1.00640249, + "epoch": 0.21932303252570193, + "flos": 29242238734080.0, + "grad_norm": 1.8535693758508167, + "language_loss": 0.80239362, + "learning_rate": 3.635793134936044e-06, + "loss": 0.82481313, + "num_input_tokens_seen": 38840460, + "step": 1824, + "time_per_iteration": 2.748671293258667 + }, + { + "auxiliary_loss_clip": 0.01152509, + "auxiliary_loss_mlp": 0.01087729, + "balance_loss_clip": 1.03835726, + "balance_loss_mlp": 1.00590372, + "epoch": 0.219443275416341, + "flos": 20806857907200.0, + "grad_norm": 1.6912341993218722, + "language_loss": 0.7328831, + "learning_rate": 3.635344817633494e-06, + "loss": 0.7552855, + "num_input_tokens_seen": 38859775, + "step": 1825, + "time_per_iteration": 4.66403865814209 + }, + { + "auxiliary_loss_clip": 0.01148527, + "auxiliary_loss_mlp": 0.01087397, + "balance_loss_clip": 1.03580058, + "balance_loss_mlp": 1.00547624, + "epoch": 0.2195635183069801, + "flos": 14501555458560.0, + "grad_norm": 2.0935970256120906, + "language_loss": 0.74944472, + "learning_rate": 3.634896252250436e-06, + "loss": 0.77180392, + "num_input_tokens_seen": 38876540, + "step": 1826, + "time_per_iteration": 2.68190598487854 + }, + { + "auxiliary_loss_clip": 0.01162665, + "auxiliary_loss_mlp": 0.01090617, + "balance_loss_clip": 1.03896856, + "balance_loss_mlp": 1.00860071, + "epoch": 0.2196837611976192, + "flos": 24243473589120.0, + "grad_norm": 1.8349691984307108, + "language_loss": 0.81981218, + "learning_rate": 3.6344474388549157e-06, + "loss": 0.842345, + "num_input_tokens_seen": 38896195, + "step": 1827, + "time_per_iteration": 2.6998374462127686 + }, + { + "auxiliary_loss_clip": 0.01149378, + "auxiliary_loss_mlp": 0.01090755, + "balance_loss_clip": 1.03591943, + "balance_loss_mlp": 1.00873899, + "epoch": 0.2198040040882583, + "flos": 18074523168000.0, + "grad_norm": 2.1183111562388603, + "language_loss": 0.80218846, + "learning_rate": 3.6339983775150183e-06, + "loss": 0.82458979, + "num_input_tokens_seen": 38912755, + "step": 1828, + "time_per_iteration": 2.6784398555755615 + }, + { + "auxiliary_loss_clip": 0.01150486, + "auxiliary_loss_mlp": 0.01089339, + "balance_loss_clip": 1.03738356, + "balance_loss_mlp": 1.00760913, + "epoch": 0.21992424697889737, + "flos": 17784185535360.0, + "grad_norm": 2.5546286020879614, + "language_loss": 0.84558779, + "learning_rate": 3.6335490682988664e-06, + "loss": 0.86798596, + "num_input_tokens_seen": 38928365, + "step": 1829, + "time_per_iteration": 2.6535348892211914 + }, + { + "auxiliary_loss_clip": 0.01116309, + "auxiliary_loss_mlp": 0.01088774, + "balance_loss_clip": 1.03709853, + "balance_loss_mlp": 1.00675821, + "epoch": 0.22004448986953645, + "flos": 17638495971840.0, + "grad_norm": 2.1043547241878255, + "language_loss": 0.82895148, + "learning_rate": 3.63309951127462e-06, + "loss": 0.85100234, + "num_input_tokens_seen": 38945275, + "step": 1830, + "time_per_iteration": 2.8829312324523926 + }, + { + "auxiliary_loss_clip": 0.01128962, + "auxiliary_loss_mlp": 0.01088192, + "balance_loss_clip": 1.03442514, + "balance_loss_mlp": 1.00631917, + "epoch": 0.22016473276017556, + "flos": 22275533203200.0, + "grad_norm": 2.0821032020613837, + "language_loss": 0.7535736, + "learning_rate": 3.6326497065104757e-06, + "loss": 0.77574515, + "num_input_tokens_seen": 38965740, + "step": 1831, + "time_per_iteration": 2.7622315883636475 + }, + { + "auxiliary_loss_clip": 0.01155683, + "auxiliary_loss_mlp": 0.01089926, + "balance_loss_clip": 1.04049098, + "balance_loss_mlp": 1.00810027, + "epoch": 0.22028497565081465, + "flos": 25556259859200.0, + "grad_norm": 1.9659151630723506, + "language_loss": 0.78182429, + "learning_rate": 3.6321996540746697e-06, + "loss": 0.8042804, + "num_input_tokens_seen": 38984815, + "step": 1832, + "time_per_iteration": 2.749267816543579 + }, + { + "auxiliary_loss_clip": 0.01128136, + "auxiliary_loss_mlp": 0.01088649, + "balance_loss_clip": 1.03560317, + "balance_loss_mlp": 1.00682354, + "epoch": 0.22040521854145373, + "flos": 36247332925440.0, + "grad_norm": 1.7470353459353378, + "language_loss": 0.80255157, + "learning_rate": 3.6317493540354733e-06, + "loss": 0.82471943, + "num_input_tokens_seen": 39008230, + "step": 1833, + "time_per_iteration": 2.851144313812256 + }, + { + "auxiliary_loss_clip": 0.01154245, + "auxiliary_loss_mlp": 0.01091478, + "balance_loss_clip": 1.03861356, + "balance_loss_mlp": 1.00941467, + "epoch": 0.22052546143209284, + "flos": 11838420270720.0, + "grad_norm": 1.9542924651820726, + "language_loss": 0.76906902, + "learning_rate": 3.6312988064611976e-06, + "loss": 0.79152626, + "num_input_tokens_seen": 39026540, + "step": 1834, + "time_per_iteration": 2.749110698699951 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01087833, + "balance_loss_clip": 1.03467703, + "balance_loss_mlp": 1.00600827, + "epoch": 0.22064570432273192, + "flos": 24209250906240.0, + "grad_norm": 2.026905181576918, + "language_loss": 0.81135923, + "learning_rate": 3.6308480114201896e-06, + "loss": 0.83355367, + "num_input_tokens_seen": 39048460, + "step": 1835, + "time_per_iteration": 2.8586201667785645 + }, + { + "auxiliary_loss_clip": 0.01163007, + "auxiliary_loss_mlp": 0.01088527, + "balance_loss_clip": 1.03992748, + "balance_loss_mlp": 1.00670147, + "epoch": 0.220765947213371, + "flos": 17931347556480.0, + "grad_norm": 1.727788728299984, + "language_loss": 0.76462317, + "learning_rate": 3.630396968980835e-06, + "loss": 0.78713846, + "num_input_tokens_seen": 39066335, + "step": 1836, + "time_per_iteration": 2.7383010387420654 + }, + { + "auxiliary_loss_clip": 0.01141879, + "auxiliary_loss_mlp": 0.01090389, + "balance_loss_clip": 1.03618038, + "balance_loss_mlp": 1.00827789, + "epoch": 0.2208861901040101, + "flos": 26757040544640.0, + "grad_norm": 4.955523165530635, + "language_loss": 0.83966887, + "learning_rate": 3.6299456792115575e-06, + "loss": 0.86199152, + "num_input_tokens_seen": 39087590, + "step": 1837, + "time_per_iteration": 2.8421735763549805 + }, + { + "auxiliary_loss_clip": 0.01099483, + "auxiliary_loss_mlp": 0.01087756, + "balance_loss_clip": 1.03450978, + "balance_loss_mlp": 1.00588346, + "epoch": 0.2210064329946492, + "flos": 17817977255040.0, + "grad_norm": 1.8440837952540858, + "language_loss": 0.81049871, + "learning_rate": 3.629494142180815e-06, + "loss": 0.83237112, + "num_input_tokens_seen": 39106335, + "step": 1838, + "time_per_iteration": 3.070457935333252 + }, + { + "auxiliary_loss_clip": 0.01162541, + "auxiliary_loss_mlp": 0.01088331, + "balance_loss_clip": 1.03990483, + "balance_loss_mlp": 1.00645781, + "epoch": 0.22112667588528828, + "flos": 17967401832960.0, + "grad_norm": 2.36380882740177, + "language_loss": 0.85043377, + "learning_rate": 3.6290423579571075e-06, + "loss": 0.87294245, + "num_input_tokens_seen": 39122875, + "step": 1839, + "time_per_iteration": 2.7457430362701416 + }, + { + "auxiliary_loss_clip": 0.01149054, + "auxiliary_loss_mlp": 0.01089315, + "balance_loss_clip": 1.03581679, + "balance_loss_mlp": 1.00748932, + "epoch": 0.22124691877592736, + "flos": 18369206346240.0, + "grad_norm": 1.5674156503001446, + "language_loss": 0.80277562, + "learning_rate": 3.6285903266089694e-06, + "loss": 0.82515931, + "num_input_tokens_seen": 39142150, + "step": 1840, + "time_per_iteration": 2.727537155151367 + }, + { + "auxiliary_loss_clip": 0.01140425, + "auxiliary_loss_mlp": 0.01088726, + "balance_loss_clip": 1.0360657, + "balance_loss_mlp": 1.00680494, + "epoch": 0.22136716166656648, + "flos": 20813286441600.0, + "grad_norm": 2.801797276149054, + "language_loss": 0.77170539, + "learning_rate": 3.628138048204974e-06, + "loss": 0.79399693, + "num_input_tokens_seen": 39162835, + "step": 1841, + "time_per_iteration": 2.79460072517395 + }, + { + "auxiliary_loss_clip": 0.01113702, + "auxiliary_loss_mlp": 0.01089193, + "balance_loss_clip": 1.03306675, + "balance_loss_mlp": 1.00693882, + "epoch": 0.22148740455720556, + "flos": 17675699483520.0, + "grad_norm": 1.6693901151402195, + "language_loss": 0.76127946, + "learning_rate": 3.6276855228137304e-06, + "loss": 0.78330839, + "num_input_tokens_seen": 39181040, + "step": 1842, + "time_per_iteration": 2.842003583908081 + }, + { + "auxiliary_loss_clip": 0.01162416, + "auxiliary_loss_mlp": 0.00874173, + "balance_loss_clip": 1.03932714, + "balance_loss_mlp": 1.00000811, + "epoch": 0.22160764744784464, + "flos": 21726710323200.0, + "grad_norm": 1.9788355134983642, + "language_loss": 0.81388986, + "learning_rate": 3.6272327505038874e-06, + "loss": 0.83425581, + "num_input_tokens_seen": 39197505, + "step": 1843, + "time_per_iteration": 2.7252588272094727 + }, + { + "auxiliary_loss_clip": 0.01118747, + "auxiliary_loss_mlp": 0.01086417, + "balance_loss_clip": 1.03612399, + "balance_loss_mlp": 1.00468695, + "epoch": 0.22172789033848372, + "flos": 23764712186880.0, + "grad_norm": 1.803433743736755, + "language_loss": 0.78335214, + "learning_rate": 3.626779731344131e-06, + "loss": 0.80540383, + "num_input_tokens_seen": 39217295, + "step": 1844, + "time_per_iteration": 3.867683172225952 + }, + { + "auxiliary_loss_clip": 0.01161758, + "auxiliary_loss_mlp": 0.01089111, + "balance_loss_clip": 1.03874302, + "balance_loss_mlp": 1.00723755, + "epoch": 0.22184813322912283, + "flos": 16982300361600.0, + "grad_norm": 1.9536771099985422, + "language_loss": 0.85276628, + "learning_rate": 3.6263264654031814e-06, + "loss": 0.87527502, + "num_input_tokens_seen": 39234195, + "step": 1845, + "time_per_iteration": 2.8101117610931396 + }, + { + "auxiliary_loss_clip": 0.01130115, + "auxiliary_loss_mlp": 0.01080137, + "balance_loss_clip": 1.04521298, + "balance_loss_mlp": 1.00041032, + "epoch": 0.22196837611976192, + "flos": 61823740314240.0, + "grad_norm": 0.909779415172543, + "language_loss": 0.59204161, + "learning_rate": 3.6258729527498008e-06, + "loss": 0.61414421, + "num_input_tokens_seen": 39295040, + "step": 1846, + "time_per_iteration": 3.4521968364715576 + }, + { + "auxiliary_loss_clip": 0.01142357, + "auxiliary_loss_mlp": 0.01088225, + "balance_loss_clip": 1.0367198, + "balance_loss_mlp": 1.00630414, + "epoch": 0.222088619010401, + "flos": 25558019625600.0, + "grad_norm": 3.7397181079177915, + "language_loss": 0.64592063, + "learning_rate": 3.6254191934527854e-06, + "loss": 0.66822642, + "num_input_tokens_seen": 39314395, + "step": 1847, + "time_per_iteration": 3.958587646484375 + }, + { + "auxiliary_loss_clip": 0.01118289, + "auxiliary_loss_mlp": 0.0108881, + "balance_loss_clip": 1.0307374, + "balance_loss_mlp": 1.00693703, + "epoch": 0.2222088619010401, + "flos": 19318612677120.0, + "grad_norm": 1.8219719198028437, + "language_loss": 0.65020561, + "learning_rate": 3.6249651875809715e-06, + "loss": 0.67227662, + "num_input_tokens_seen": 39334275, + "step": 1848, + "time_per_iteration": 2.793973922729492 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01090386, + "balance_loss_clip": 1.03386056, + "balance_loss_mlp": 1.00841784, + "epoch": 0.2223291047916792, + "flos": 19099342103040.0, + "grad_norm": 1.917466178288484, + "language_loss": 0.88875508, + "learning_rate": 3.62451093520323e-06, + "loss": 0.91100001, + "num_input_tokens_seen": 39352180, + "step": 1849, + "time_per_iteration": 2.760481595993042 + }, + { + "auxiliary_loss_clip": 0.01116037, + "auxiliary_loss_mlp": 0.01088581, + "balance_loss_clip": 1.03104186, + "balance_loss_mlp": 1.0066123, + "epoch": 0.22244934768231828, + "flos": 20850418126080.0, + "grad_norm": 2.751524113599278, + "language_loss": 0.90302384, + "learning_rate": 3.6240564363884714e-06, + "loss": 0.92507005, + "num_input_tokens_seen": 39372125, + "step": 1850, + "time_per_iteration": 3.8143320083618164 + }, + { + "auxiliary_loss_clip": 0.01151632, + "auxiliary_loss_mlp": 0.01088683, + "balance_loss_clip": 1.03677273, + "balance_loss_mlp": 1.00657141, + "epoch": 0.2225695905729574, + "flos": 15632921111040.0, + "grad_norm": 1.9617211674318036, + "language_loss": 0.70125276, + "learning_rate": 3.623601691205643e-06, + "loss": 0.72365594, + "num_input_tokens_seen": 39391200, + "step": 1851, + "time_per_iteration": 3.6557254791259766 + }, + { + "auxiliary_loss_clip": 0.01150996, + "auxiliary_loss_mlp": 0.01087422, + "balance_loss_clip": 1.03700709, + "balance_loss_mlp": 1.00540638, + "epoch": 0.22268983346359647, + "flos": 25373582265600.0, + "grad_norm": 1.8818880986334814, + "language_loss": 0.81426316, + "learning_rate": 3.623146699723729e-06, + "loss": 0.83664733, + "num_input_tokens_seen": 39410660, + "step": 1852, + "time_per_iteration": 2.7751431465148926 + }, + { + "auxiliary_loss_clip": 0.01140633, + "auxiliary_loss_mlp": 0.01089088, + "balance_loss_clip": 1.03772867, + "balance_loss_mlp": 1.00735784, + "epoch": 0.22281007635423555, + "flos": 13261452359040.0, + "grad_norm": 2.0597843990266775, + "language_loss": 0.77428937, + "learning_rate": 3.6226914620117507e-06, + "loss": 0.79658657, + "num_input_tokens_seen": 39429280, + "step": 1853, + "time_per_iteration": 2.7104313373565674 + }, + { + "auxiliary_loss_clip": 0.01128599, + "auxiliary_loss_mlp": 0.01088248, + "balance_loss_clip": 1.03550601, + "balance_loss_mlp": 1.00637484, + "epoch": 0.22293031924487464, + "flos": 15340536403200.0, + "grad_norm": 2.335710966069249, + "language_loss": 0.81012827, + "learning_rate": 3.622235978138768e-06, + "loss": 0.83229667, + "num_input_tokens_seen": 39446905, + "step": 1854, + "time_per_iteration": 2.9044830799102783 + }, + { + "auxiliary_loss_clip": 0.0114747, + "auxiliary_loss_mlp": 0.01090097, + "balance_loss_clip": 1.03705847, + "balance_loss_mlp": 1.00817633, + "epoch": 0.22305056213551375, + "flos": 22564649773440.0, + "grad_norm": 1.776377885083579, + "language_loss": 0.81197727, + "learning_rate": 3.621780248173877e-06, + "loss": 0.83435297, + "num_input_tokens_seen": 39465105, + "step": 1855, + "time_per_iteration": 2.7128164768218994 + }, + { + "auxiliary_loss_clip": 0.0115114, + "auxiliary_loss_mlp": 0.01080716, + "balance_loss_clip": 1.04968393, + "balance_loss_mlp": 1.00098884, + "epoch": 0.22317080502615283, + "flos": 64880419887360.0, + "grad_norm": 0.8276114184865527, + "language_loss": 0.61115205, + "learning_rate": 3.6213242721862125e-06, + "loss": 0.63347065, + "num_input_tokens_seen": 39523560, + "step": 1856, + "time_per_iteration": 3.300943374633789 + }, + { + "auxiliary_loss_clip": 0.01141748, + "auxiliary_loss_mlp": 0.01088735, + "balance_loss_clip": 1.03730059, + "balance_loss_mlp": 1.00700569, + "epoch": 0.2232910479167919, + "flos": 25775997310080.0, + "grad_norm": 1.8160887573063353, + "language_loss": 0.75464278, + "learning_rate": 3.620868050244945e-06, + "loss": 0.77694762, + "num_input_tokens_seen": 39544040, + "step": 1857, + "time_per_iteration": 2.8979499340057373 + }, + { + "auxiliary_loss_clip": 0.01141039, + "auxiliary_loss_mlp": 0.01088904, + "balance_loss_clip": 1.03554142, + "balance_loss_mlp": 1.00674474, + "epoch": 0.22341129080743102, + "flos": 23251799928960.0, + "grad_norm": 2.0511995026928145, + "language_loss": 0.77487397, + "learning_rate": 3.6204115824192817e-06, + "loss": 0.79717338, + "num_input_tokens_seen": 39561515, + "step": 1858, + "time_per_iteration": 2.7440059185028076 + }, + { + "auxiliary_loss_clip": 0.01143102, + "auxiliary_loss_mlp": 0.01088644, + "balance_loss_clip": 1.03762579, + "balance_loss_mlp": 1.00658011, + "epoch": 0.2235315336980701, + "flos": 21214552250880.0, + "grad_norm": 2.326924073121999, + "language_loss": 0.76644111, + "learning_rate": 3.619954868778471e-06, + "loss": 0.78875864, + "num_input_tokens_seen": 39578210, + "step": 1859, + "time_per_iteration": 2.7679896354675293 + }, + { + "auxiliary_loss_clip": 0.01143933, + "auxiliary_loss_mlp": 0.01087602, + "balance_loss_clip": 1.03682947, + "balance_loss_mlp": 1.00591993, + "epoch": 0.2236517765887092, + "flos": 19901945548800.0, + "grad_norm": 1.9779455431615753, + "language_loss": 0.83151293, + "learning_rate": 3.6194979093917944e-06, + "loss": 0.85382831, + "num_input_tokens_seen": 39597625, + "step": 1860, + "time_per_iteration": 2.7809693813323975 + }, + { + "auxiliary_loss_clip": 0.01138273, + "auxiliary_loss_mlp": 0.01087432, + "balance_loss_clip": 1.03411067, + "balance_loss_mlp": 1.00579715, + "epoch": 0.22377201947934827, + "flos": 23214847812480.0, + "grad_norm": 1.951515907739102, + "language_loss": 0.86952907, + "learning_rate": 3.6190407043285724e-06, + "loss": 0.8917861, + "num_input_tokens_seen": 39615360, + "step": 1861, + "time_per_iteration": 2.7597899436950684 + }, + { + "auxiliary_loss_clip": 0.01160129, + "auxiliary_loss_mlp": 0.01088871, + "balance_loss_clip": 1.03742409, + "balance_loss_mlp": 1.00690222, + "epoch": 0.22389226236998738, + "flos": 26794244056320.0, + "grad_norm": 1.7680110393032051, + "language_loss": 0.75862968, + "learning_rate": 3.618583253658163e-06, + "loss": 0.7811197, + "num_input_tokens_seen": 39635460, + "step": 1862, + "time_per_iteration": 2.6825461387634277 + }, + { + "auxiliary_loss_clip": 0.01114398, + "auxiliary_loss_mlp": 0.00874168, + "balance_loss_clip": 1.02906346, + "balance_loss_mlp": 1.00004172, + "epoch": 0.22401250526062647, + "flos": 24170359455360.0, + "grad_norm": 2.06061835073897, + "language_loss": 0.86550546, + "learning_rate": 3.618125557449961e-06, + "loss": 0.88539118, + "num_input_tokens_seen": 39653515, + "step": 1863, + "time_per_iteration": 2.8549137115478516 + }, + { + "auxiliary_loss_clip": 0.0114814, + "auxiliary_loss_mlp": 0.01087988, + "balance_loss_clip": 1.03514993, + "balance_loss_mlp": 1.00625777, + "epoch": 0.22413274815126555, + "flos": 16759761649920.0, + "grad_norm": 2.4257865424021827, + "language_loss": 0.83033818, + "learning_rate": 3.6176676157733983e-06, + "loss": 0.85269946, + "num_input_tokens_seen": 39668525, + "step": 1864, + "time_per_iteration": 2.7233216762542725 + }, + { + "auxiliary_loss_clip": 0.01130376, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_clip": 1.03385496, + "balance_loss_mlp": 1.00687575, + "epoch": 0.22425299104190466, + "flos": 21360205900800.0, + "grad_norm": 1.9326669975201667, + "language_loss": 0.76323205, + "learning_rate": 3.6172094286979443e-06, + "loss": 0.78542471, + "num_input_tokens_seen": 39685895, + "step": 1865, + "time_per_iteration": 2.8673484325408936 + }, + { + "auxiliary_loss_clip": 0.01140162, + "auxiliary_loss_mlp": 0.01089605, + "balance_loss_clip": 1.03495014, + "balance_loss_mlp": 1.00777924, + "epoch": 0.22437323393254374, + "flos": 32165547108480.0, + "grad_norm": 1.3864837371554488, + "language_loss": 0.81345665, + "learning_rate": 3.6167509962931064e-06, + "loss": 0.83575422, + "num_input_tokens_seen": 39711595, + "step": 1866, + "time_per_iteration": 2.969538927078247 + }, + { + "auxiliary_loss_clip": 0.01121975, + "auxiliary_loss_mlp": 0.01090818, + "balance_loss_clip": 1.03422201, + "balance_loss_mlp": 1.00870681, + "epoch": 0.22449347682318282, + "flos": 18002809664640.0, + "grad_norm": 2.740933292901764, + "language_loss": 0.77373368, + "learning_rate": 3.6162923186284276e-06, + "loss": 0.79586166, + "num_input_tokens_seen": 39727555, + "step": 1867, + "time_per_iteration": 2.810713052749634 + }, + { + "auxiliary_loss_clip": 0.01143344, + "auxiliary_loss_mlp": 0.01090686, + "balance_loss_clip": 1.037287, + "balance_loss_mlp": 1.00876558, + "epoch": 0.2246137197138219, + "flos": 18697286194560.0, + "grad_norm": 1.9277313200557873, + "language_loss": 0.85592538, + "learning_rate": 3.6158333957734888e-06, + "loss": 0.87826568, + "num_input_tokens_seen": 39746145, + "step": 1868, + "time_per_iteration": 2.7702932357788086 + }, + { + "auxiliary_loss_clip": 0.01132046, + "auxiliary_loss_mlp": 0.01088943, + "balance_loss_clip": 1.03473663, + "balance_loss_mlp": 1.00721335, + "epoch": 0.22473396260446102, + "flos": 15590653781760.0, + "grad_norm": 1.9702816046293141, + "language_loss": 0.82723784, + "learning_rate": 3.6153742277979088e-06, + "loss": 0.84944773, + "num_input_tokens_seen": 39763575, + "step": 1869, + "time_per_iteration": 3.7625513076782227 + }, + { + "auxiliary_loss_clip": 0.0113929, + "auxiliary_loss_mlp": 0.01090751, + "balance_loss_clip": 1.03428137, + "balance_loss_mlp": 1.00902152, + "epoch": 0.2248542054951001, + "flos": 14465501182080.0, + "grad_norm": 2.168210451590677, + "language_loss": 0.78513139, + "learning_rate": 3.6149148147713434e-06, + "loss": 0.80743182, + "num_input_tokens_seen": 39781810, + "step": 1870, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.01152437, + "auxiliary_loss_mlp": 0.01089648, + "balance_loss_clip": 1.03828645, + "balance_loss_mlp": 1.00796604, + "epoch": 0.22497444838573918, + "flos": 19243882431360.0, + "grad_norm": 1.9999813482323066, + "language_loss": 0.86269808, + "learning_rate": 3.614455156763484e-06, + "loss": 0.8851189, + "num_input_tokens_seen": 39800115, + "step": 1871, + "time_per_iteration": 2.6869637966156006 + }, + { + "auxiliary_loss_clip": 0.01123132, + "auxiliary_loss_mlp": 0.01091373, + "balance_loss_clip": 1.03403544, + "balance_loss_mlp": 1.00926161, + "epoch": 0.2250946912763783, + "flos": 16910299549440.0, + "grad_norm": 1.9266010585651316, + "language_loss": 0.71481985, + "learning_rate": 3.613995253844061e-06, + "loss": 0.73696494, + "num_input_tokens_seen": 39817795, + "step": 1872, + "time_per_iteration": 3.7822039127349854 + }, + { + "auxiliary_loss_clip": 0.01142577, + "auxiliary_loss_mlp": 0.01087054, + "balance_loss_clip": 1.03150749, + "balance_loss_mlp": 1.00513315, + "epoch": 0.22521493416701738, + "flos": 24681368292480.0, + "grad_norm": 1.7987659653484238, + "language_loss": 0.81157857, + "learning_rate": 3.6135351060828414e-06, + "loss": 0.83387494, + "num_input_tokens_seen": 39838270, + "step": 1873, + "time_per_iteration": 2.6962080001831055 + }, + { + "auxiliary_loss_clip": 0.01161555, + "auxiliary_loss_mlp": 0.01088408, + "balance_loss_clip": 1.03886604, + "balance_loss_mlp": 1.00634432, + "epoch": 0.22533517705765646, + "flos": 17821963664640.0, + "grad_norm": 1.8701751497548216, + "language_loss": 0.69212258, + "learning_rate": 3.6130747135496285e-06, + "loss": 0.71462214, + "num_input_tokens_seen": 39857270, + "step": 1874, + "time_per_iteration": 2.745553493499756 + }, + { + "auxiliary_loss_clip": 0.01158709, + "auxiliary_loss_mlp": 0.01088742, + "balance_loss_clip": 1.03660727, + "balance_loss_mlp": 1.00677335, + "epoch": 0.22545541994829554, + "flos": 33691390899840.0, + "grad_norm": 1.7798667810031297, + "language_loss": 0.65920877, + "learning_rate": 3.6126140763142646e-06, + "loss": 0.68168324, + "num_input_tokens_seen": 39882300, + "step": 1875, + "time_per_iteration": 2.797259569168091 + }, + { + "auxiliary_loss_clip": 0.01159991, + "auxiliary_loss_mlp": 0.01089275, + "balance_loss_clip": 1.03732371, + "balance_loss_mlp": 1.00725913, + "epoch": 0.22557566283893465, + "flos": 19171594310400.0, + "grad_norm": 2.8628245407222823, + "language_loss": 0.85535896, + "learning_rate": 3.6121531944466275e-06, + "loss": 0.87785161, + "num_input_tokens_seen": 39899625, + "step": 1876, + "time_per_iteration": 4.6666178703308105 + }, + { + "auxiliary_loss_clip": 0.01149609, + "auxiliary_loss_mlp": 0.01087963, + "balance_loss_clip": 1.03678417, + "balance_loss_mlp": 1.00613821, + "epoch": 0.22569590572957374, + "flos": 20773281669120.0, + "grad_norm": 2.0780978590952444, + "language_loss": 0.78185821, + "learning_rate": 3.611692068016633e-06, + "loss": 0.80423391, + "num_input_tokens_seen": 39915955, + "step": 1877, + "time_per_iteration": 2.800105094909668 + }, + { + "auxiliary_loss_clip": 0.01132523, + "auxiliary_loss_mlp": 0.01090322, + "balance_loss_clip": 1.03490019, + "balance_loss_mlp": 1.00821042, + "epoch": 0.22581614862021282, + "flos": 18442715529600.0, + "grad_norm": 2.0762986452285705, + "language_loss": 0.75034422, + "learning_rate": 3.611230697094233e-06, + "loss": 0.7725727, + "num_input_tokens_seen": 39932655, + "step": 1878, + "time_per_iteration": 2.8068113327026367 + }, + { + "auxiliary_loss_clip": 0.01142669, + "auxiliary_loss_mlp": 0.01090177, + "balance_loss_clip": 1.03738856, + "balance_loss_mlp": 1.0083518, + "epoch": 0.22593639151085193, + "flos": 20048389297920.0, + "grad_norm": 1.6861424628359074, + "language_loss": 0.87209284, + "learning_rate": 3.6107690817494173e-06, + "loss": 0.89442122, + "num_input_tokens_seen": 39952875, + "step": 1879, + "time_per_iteration": 2.8677916526794434 + }, + { + "auxiliary_loss_clip": 0.01120919, + "auxiliary_loss_mlp": 0.01088321, + "balance_loss_clip": 1.03257227, + "balance_loss_mlp": 1.00630498, + "epoch": 0.226056634401491, + "flos": 13115116350720.0, + "grad_norm": 2.2681862651792972, + "language_loss": 0.7078281, + "learning_rate": 3.6103072220522117e-06, + "loss": 0.72992051, + "num_input_tokens_seen": 39968405, + "step": 1880, + "time_per_iteration": 2.797518730163574 + }, + { + "auxiliary_loss_clip": 0.01130692, + "auxiliary_loss_mlp": 0.01089609, + "balance_loss_clip": 1.03473747, + "balance_loss_mlp": 1.00768816, + "epoch": 0.2261768772921301, + "flos": 18988378012800.0, + "grad_norm": 1.8013891127167836, + "language_loss": 0.91901946, + "learning_rate": 3.609845118072682e-06, + "loss": 0.94122249, + "num_input_tokens_seen": 39987075, + "step": 1881, + "time_per_iteration": 2.8209753036499023 + }, + { + "auxiliary_loss_clip": 0.01151041, + "auxiliary_loss_mlp": 0.00874234, + "balance_loss_clip": 1.03701568, + "balance_loss_mlp": 1.00005412, + "epoch": 0.2262971201827692, + "flos": 19974054101760.0, + "grad_norm": 1.763923860219382, + "language_loss": 0.7970289, + "learning_rate": 3.6093827698809276e-06, + "loss": 0.81728172, + "num_input_tokens_seen": 40006175, + "step": 1882, + "time_per_iteration": 2.7907915115356445 + }, + { + "auxiliary_loss_clip": 0.01151454, + "auxiliary_loss_mlp": 0.01088007, + "balance_loss_clip": 1.03717399, + "balance_loss_mlp": 1.00618184, + "epoch": 0.2264173630734083, + "flos": 16654543735680.0, + "grad_norm": 2.6371901000744304, + "language_loss": 0.84996641, + "learning_rate": 3.6089201775470864e-06, + "loss": 0.87236106, + "num_input_tokens_seen": 40021630, + "step": 1883, + "time_per_iteration": 0.029833555221557617 + }, + { + "auxiliary_loss_clip": 0.0112744, + "auxiliary_loss_mlp": 0.01089215, + "balance_loss_clip": 1.03271353, + "balance_loss_mlp": 1.00743747, + "epoch": 0.22653760596404737, + "flos": 24389809597440.0, + "grad_norm": 1.3136829957516818, + "language_loss": 0.77509141, + "learning_rate": 3.6084573411413334e-06, + "loss": 0.79725796, + "num_input_tokens_seen": 40041025, + "step": 1884, + "time_per_iteration": 2.901547908782959 + }, + { + "auxiliary_loss_clip": 0.01132541, + "auxiliary_loss_mlp": 0.0108988, + "balance_loss_clip": 1.03548479, + "balance_loss_mlp": 1.00791168, + "epoch": 0.22665784885468646, + "flos": 18332541538560.0, + "grad_norm": 1.9653938907923136, + "language_loss": 0.80972815, + "learning_rate": 3.607994260733881e-06, + "loss": 0.83195233, + "num_input_tokens_seen": 40060265, + "step": 1885, + "time_per_iteration": 2.800764799118042 + }, + { + "auxiliary_loss_clip": 0.01150181, + "auxiliary_loss_mlp": 0.01088862, + "balance_loss_clip": 1.03645372, + "balance_loss_mlp": 1.0069418, + "epoch": 0.22677809174532557, + "flos": 24058102475520.0, + "grad_norm": 1.5969592535915262, + "language_loss": 0.74607301, + "learning_rate": 3.6075309363949776e-06, + "loss": 0.76846343, + "num_input_tokens_seen": 40079435, + "step": 1886, + "time_per_iteration": 2.769045829772949 + }, + { + "auxiliary_loss_clip": 0.0116085, + "auxiliary_loss_mlp": 0.01088139, + "balance_loss_clip": 1.038517, + "balance_loss_mlp": 1.00640893, + "epoch": 0.22689833463596465, + "flos": 20374242503040.0, + "grad_norm": 1.8541237271240607, + "language_loss": 0.81361169, + "learning_rate": 3.6070673681949094e-06, + "loss": 0.83610159, + "num_input_tokens_seen": 40097800, + "step": 1887, + "time_per_iteration": 2.7260918617248535 + }, + { + "auxiliary_loss_clip": 0.01141049, + "auxiliary_loss_mlp": 0.00874035, + "balance_loss_clip": 1.03672552, + "balance_loss_mlp": 1.00001764, + "epoch": 0.22701857752660373, + "flos": 30120398438400.0, + "grad_norm": 1.6111838713228595, + "language_loss": 0.81303501, + "learning_rate": 3.606603556203999e-06, + "loss": 0.83318591, + "num_input_tokens_seen": 40122745, + "step": 1888, + "time_per_iteration": 2.8825740814208984 + }, + { + "auxiliary_loss_clip": 0.01151351, + "auxiliary_loss_mlp": 0.01087696, + "balance_loss_clip": 1.03720212, + "balance_loss_mlp": 1.00582325, + "epoch": 0.22713882041724284, + "flos": 22492182084480.0, + "grad_norm": 1.9553292820086705, + "language_loss": 0.83780009, + "learning_rate": 3.6061395004926066e-06, + "loss": 0.86019063, + "num_input_tokens_seen": 40141680, + "step": 1889, + "time_per_iteration": 2.763883590698242 + }, + { + "auxiliary_loss_clip": 0.01159471, + "auxiliary_loss_mlp": 0.01086763, + "balance_loss_clip": 1.03730905, + "balance_loss_mlp": 1.00479496, + "epoch": 0.22725906330788193, + "flos": 20521548178560.0, + "grad_norm": 2.5014442414041755, + "language_loss": 0.84940284, + "learning_rate": 3.605675201131129e-06, + "loss": 0.87186515, + "num_input_tokens_seen": 40160140, + "step": 1890, + "time_per_iteration": 2.699033498764038 + }, + { + "auxiliary_loss_clip": 0.01146391, + "auxiliary_loss_mlp": 0.01089853, + "balance_loss_clip": 1.03836644, + "balance_loss_mlp": 1.00797963, + "epoch": 0.227379306198521, + "flos": 18989922297600.0, + "grad_norm": 2.5085636505733198, + "language_loss": 0.80009425, + "learning_rate": 3.60521065819e-06, + "loss": 0.82245666, + "num_input_tokens_seen": 40177450, + "step": 1891, + "time_per_iteration": 2.760202407836914 + }, + { + "auxiliary_loss_clip": 0.01141978, + "auxiliary_loss_mlp": 0.01088508, + "balance_loss_clip": 1.03606319, + "balance_loss_mlp": 1.00668299, + "epoch": 0.2274995490891601, + "flos": 21798351999360.0, + "grad_norm": 1.818052874334154, + "language_loss": 0.87601334, + "learning_rate": 3.60474587173969e-06, + "loss": 0.89831817, + "num_input_tokens_seen": 40195935, + "step": 1892, + "time_per_iteration": 2.794614553451538 + }, + { + "auxiliary_loss_clip": 0.01149912, + "auxiliary_loss_mlp": 0.01089654, + "balance_loss_clip": 1.0374217, + "balance_loss_mlp": 1.0077337, + "epoch": 0.2276197919797992, + "flos": 19058654972160.0, + "grad_norm": 2.1885557402880798, + "language_loss": 0.84194148, + "learning_rate": 3.6042808418507084e-06, + "loss": 0.86433709, + "num_input_tokens_seen": 40213620, + "step": 1893, + "time_per_iteration": 2.747049570083618 + }, + { + "auxiliary_loss_clip": 0.01147926, + "auxiliary_loss_mlp": 0.01091429, + "balance_loss_clip": 1.03594947, + "balance_loss_mlp": 1.00941288, + "epoch": 0.22774003487043828, + "flos": 18806777827200.0, + "grad_norm": 2.013164710435614, + "language_loss": 0.76773185, + "learning_rate": 3.6038155685935976e-06, + "loss": 0.79012543, + "num_input_tokens_seen": 40230190, + "step": 1894, + "time_per_iteration": 2.7082877159118652 + }, + { + "auxiliary_loss_clip": 0.0114807, + "auxiliary_loss_mlp": 0.01089557, + "balance_loss_clip": 1.03553653, + "balance_loss_mlp": 1.00744557, + "epoch": 0.22786027776107737, + "flos": 23002544476800.0, + "grad_norm": 1.9295732855724477, + "language_loss": 0.70747018, + "learning_rate": 3.6033500520389404e-06, + "loss": 0.72984648, + "num_input_tokens_seen": 40246860, + "step": 1895, + "time_per_iteration": 3.6603903770446777 + }, + { + "auxiliary_loss_clip": 0.01119771, + "auxiliary_loss_mlp": 0.01080159, + "balance_loss_clip": 1.0426085, + "balance_loss_mlp": 1.00043142, + "epoch": 0.22798052065171648, + "flos": 66706872600960.0, + "grad_norm": 0.7967704562497615, + "language_loss": 0.64817148, + "learning_rate": 3.6028842922573553e-06, + "loss": 0.67017078, + "num_input_tokens_seen": 40311005, + "step": 1896, + "time_per_iteration": 3.553037643432617 + }, + { + "auxiliary_loss_clip": 0.01128086, + "auxiliary_loss_mlp": 0.00873518, + "balance_loss_clip": 1.04271245, + "balance_loss_mlp": 1.00005877, + "epoch": 0.22810076354235556, + "flos": 62080896758400.0, + "grad_norm": 0.8712808124545702, + "language_loss": 0.63015068, + "learning_rate": 3.602418289319497e-06, + "loss": 0.65016663, + "num_input_tokens_seen": 40369560, + "step": 1897, + "time_per_iteration": 4.44147801399231 + }, + { + "auxiliary_loss_clip": 0.01123712, + "auxiliary_loss_mlp": 0.01091046, + "balance_loss_clip": 1.03504586, + "balance_loss_mlp": 1.00912559, + "epoch": 0.22822100643299464, + "flos": 23876358635520.0, + "grad_norm": 1.7097985706652405, + "language_loss": 0.73454303, + "learning_rate": 3.601952043296059e-06, + "loss": 0.75669062, + "num_input_tokens_seen": 40389555, + "step": 1898, + "time_per_iteration": 2.9491302967071533 + }, + { + "auxiliary_loss_clip": 0.01136977, + "auxiliary_loss_mlp": 0.010883, + "balance_loss_clip": 1.03777635, + "balance_loss_mlp": 1.00628388, + "epoch": 0.22834124932363373, + "flos": 20991331180800.0, + "grad_norm": 1.8362818192998203, + "language_loss": 0.80368823, + "learning_rate": 3.6014855542577696e-06, + "loss": 0.82594109, + "num_input_tokens_seen": 40406765, + "step": 1899, + "time_per_iteration": 2.8017425537109375 + }, + { + "auxiliary_loss_clip": 0.01143437, + "auxiliary_loss_mlp": 0.01089701, + "balance_loss_clip": 1.03776956, + "balance_loss_mlp": 1.00778043, + "epoch": 0.22846149221427284, + "flos": 24901572620160.0, + "grad_norm": 2.0251284398977663, + "language_loss": 0.84218669, + "learning_rate": 3.6010188222753943e-06, + "loss": 0.86451805, + "num_input_tokens_seen": 40427535, + "step": 1900, + "time_per_iteration": 2.89829683303833 + }, + { + "auxiliary_loss_clip": 0.01131664, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_clip": 1.03894997, + "balance_loss_mlp": 1.00052488, + "epoch": 0.22858173510491192, + "flos": 56132294319360.0, + "grad_norm": 0.9028798569199622, + "language_loss": 0.64195108, + "learning_rate": 3.6005518474197372e-06, + "loss": 0.66407025, + "num_input_tokens_seen": 40479580, + "step": 1901, + "time_per_iteration": 4.216080188751221 + }, + { + "auxiliary_loss_clip": 0.01147882, + "auxiliary_loss_mlp": 0.01089784, + "balance_loss_clip": 1.03835714, + "balance_loss_mlp": 1.00752938, + "epoch": 0.228701977995551, + "flos": 24170826332160.0, + "grad_norm": 2.0462498877119932, + "language_loss": 0.78279608, + "learning_rate": 3.6000846297616373e-06, + "loss": 0.8051728, + "num_input_tokens_seen": 40497880, + "step": 1902, + "time_per_iteration": 3.727419376373291 + }, + { + "auxiliary_loss_clip": 0.01161065, + "auxiliary_loss_mlp": 0.01089156, + "balance_loss_clip": 1.03928614, + "balance_loss_mlp": 1.00694966, + "epoch": 0.22882222088619011, + "flos": 21387892308480.0, + "grad_norm": 2.1618260789547636, + "language_loss": 0.72691953, + "learning_rate": 3.5996171693719717e-06, + "loss": 0.74942172, + "num_input_tokens_seen": 40513975, + "step": 1903, + "time_per_iteration": 2.711097478866577 + }, + { + "auxiliary_loss_clip": 0.01142158, + "auxiliary_loss_mlp": 0.01079842, + "balance_loss_clip": 1.04099786, + "balance_loss_mlp": 1.00011528, + "epoch": 0.2289424637768292, + "flos": 64589615377920.0, + "grad_norm": 0.8306398665515339, + "language_loss": 0.64783967, + "learning_rate": 3.5991494663216528e-06, + "loss": 0.67005968, + "num_input_tokens_seen": 40576960, + "step": 1904, + "time_per_iteration": 3.3437533378601074 + }, + { + "auxiliary_loss_clip": 0.01160778, + "auxiliary_loss_mlp": 0.01087508, + "balance_loss_clip": 1.03879559, + "balance_loss_mlp": 1.00553966, + "epoch": 0.22906270666746828, + "flos": 22163419877760.0, + "grad_norm": 1.8803728907443562, + "language_loss": 0.87795699, + "learning_rate": 3.5986815206816314e-06, + "loss": 0.90043986, + "num_input_tokens_seen": 40595780, + "step": 1905, + "time_per_iteration": 2.7766544818878174 + }, + { + "auxiliary_loss_clip": 0.01159844, + "auxiliary_loss_mlp": 0.01087603, + "balance_loss_clip": 1.03790092, + "balance_loss_mlp": 1.00582504, + "epoch": 0.2291829495581074, + "flos": 25772334122880.0, + "grad_norm": 2.153374005591701, + "language_loss": 0.74513745, + "learning_rate": 3.598213332522895e-06, + "loss": 0.76761192, + "num_input_tokens_seen": 40615810, + "step": 1906, + "time_per_iteration": 2.7792277336120605 + }, + { + "auxiliary_loss_clip": 0.01152431, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_clip": 1.03876007, + "balance_loss_mlp": 1.00810838, + "epoch": 0.22930319244874647, + "flos": 31172760126720.0, + "grad_norm": 1.6886889594900507, + "language_loss": 0.77295482, + "learning_rate": 3.597744901916466e-06, + "loss": 0.7953794, + "num_input_tokens_seen": 40637095, + "step": 1907, + "time_per_iteration": 2.9144906997680664 + }, + { + "auxiliary_loss_clip": 0.01159498, + "auxiliary_loss_mlp": 0.01088366, + "balance_loss_clip": 1.03668308, + "balance_loss_mlp": 1.00611162, + "epoch": 0.22942343533938556, + "flos": 23254098399360.0, + "grad_norm": 2.085075473356679, + "language_loss": 0.76939571, + "learning_rate": 3.5972762289334058e-06, + "loss": 0.79187441, + "num_input_tokens_seen": 40656725, + "step": 1908, + "time_per_iteration": 2.7529983520507812 + }, + { + "auxiliary_loss_clip": 0.01098829, + "auxiliary_loss_mlp": 0.01087717, + "balance_loss_clip": 1.02909374, + "balance_loss_mlp": 1.00589132, + "epoch": 0.22954367823002464, + "flos": 14610903436800.0, + "grad_norm": 1.918972855186975, + "language_loss": 0.85003608, + "learning_rate": 3.5968073136448116e-06, + "loss": 0.87190151, + "num_input_tokens_seen": 40674745, + "step": 1909, + "time_per_iteration": 2.8891561031341553 + }, + { + "auxiliary_loss_clip": 0.01150972, + "auxiliary_loss_mlp": 0.01088615, + "balance_loss_clip": 1.03706336, + "balance_loss_mlp": 1.00655103, + "epoch": 0.22966392112066375, + "flos": 16763604405120.0, + "grad_norm": 1.6685900349030856, + "language_loss": 0.91235882, + "learning_rate": 3.596338156121818e-06, + "loss": 0.93475467, + "num_input_tokens_seen": 40693630, + "step": 1910, + "time_per_iteration": 2.769634485244751 + }, + { + "auxiliary_loss_clip": 0.01130559, + "auxiliary_loss_mlp": 0.01079681, + "balance_loss_clip": 1.03787184, + "balance_loss_mlp": 0.99995345, + "epoch": 0.22978416401130283, + "flos": 67474247783040.0, + "grad_norm": 0.7466336108972953, + "language_loss": 0.59339881, + "learning_rate": 3.595868756435595e-06, + "loss": 0.61550117, + "num_input_tokens_seen": 40761310, + "step": 1911, + "time_per_iteration": 3.409031867980957 + }, + { + "auxiliary_loss_clip": 0.01128509, + "auxiliary_loss_mlp": 0.01088944, + "balance_loss_clip": 1.03358293, + "balance_loss_mlp": 1.00688076, + "epoch": 0.22990440690194192, + "flos": 19865137086720.0, + "grad_norm": 3.090516161747712, + "language_loss": 0.80508554, + "learning_rate": 3.5953991146573504e-06, + "loss": 0.82726008, + "num_input_tokens_seen": 40779955, + "step": 1912, + "time_per_iteration": 2.8018529415130615 + }, + { + "auxiliary_loss_clip": 0.01150113, + "auxiliary_loss_mlp": 0.01090462, + "balance_loss_clip": 1.03597784, + "balance_loss_mlp": 1.00830293, + "epoch": 0.23002464979258103, + "flos": 13289246507520.0, + "grad_norm": 2.9408558798259468, + "language_loss": 0.83933389, + "learning_rate": 3.5949292308583294e-06, + "loss": 0.86173958, + "num_input_tokens_seen": 40793200, + "step": 1913, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.01161064, + "auxiliary_loss_mlp": 0.01090687, + "balance_loss_clip": 1.03920114, + "balance_loss_mlp": 1.00867093, + "epoch": 0.2301448926832201, + "flos": 22163779013760.0, + "grad_norm": 3.1543110384206683, + "language_loss": 0.80861247, + "learning_rate": 3.594459105109811e-06, + "loss": 0.83113003, + "num_input_tokens_seen": 40812380, + "step": 1914, + "time_per_iteration": 2.690650701522827 + }, + { + "auxiliary_loss_clip": 0.01150981, + "auxiliary_loss_mlp": 0.01091069, + "balance_loss_clip": 1.03775644, + "balance_loss_mlp": 1.00905275, + "epoch": 0.2302651355738592, + "flos": 20704477167360.0, + "grad_norm": 1.7456715015375117, + "language_loss": 0.80964619, + "learning_rate": 3.593988737483115e-06, + "loss": 0.83206666, + "num_input_tokens_seen": 40832320, + "step": 1915, + "time_per_iteration": 2.6789472103118896 + }, + { + "auxiliary_loss_clip": 0.01138425, + "auxiliary_loss_mlp": 0.01090592, + "balance_loss_clip": 1.03713155, + "balance_loss_mlp": 1.00852823, + "epoch": 0.23038537846449827, + "flos": 18588943797120.0, + "grad_norm": 2.8144859046662147, + "language_loss": 0.78218007, + "learning_rate": 3.5935181280495947e-06, + "loss": 0.80447024, + "num_input_tokens_seen": 40850900, + "step": 1916, + "time_per_iteration": 2.779022455215454 + }, + { + "auxiliary_loss_clip": 0.01116414, + "auxiliary_loss_mlp": 0.01080328, + "balance_loss_clip": 1.03513288, + "balance_loss_mlp": 1.00060129, + "epoch": 0.23050562135513739, + "flos": 64224260190720.0, + "grad_norm": 0.8014855315777387, + "language_loss": 0.54376078, + "learning_rate": 3.5930472768806412e-06, + "loss": 0.56572819, + "num_input_tokens_seen": 40909570, + "step": 1917, + "time_per_iteration": 3.2435216903686523 + }, + { + "auxiliary_loss_clip": 0.01161385, + "auxiliary_loss_mlp": 0.01092274, + "balance_loss_clip": 1.03985322, + "balance_loss_mlp": 1.01016223, + "epoch": 0.23062586424577647, + "flos": 17313396952320.0, + "grad_norm": 4.197893759001274, + "language_loss": 0.77180976, + "learning_rate": 3.5925761840476826e-06, + "loss": 0.79434633, + "num_input_tokens_seen": 40928180, + "step": 1918, + "time_per_iteration": 2.7002384662628174 + }, + { + "auxiliary_loss_clip": 0.01136429, + "auxiliary_loss_mlp": 0.01089361, + "balance_loss_clip": 1.03482354, + "balance_loss_mlp": 1.00748849, + "epoch": 0.23074610713641555, + "flos": 27855979194240.0, + "grad_norm": 2.157414143528037, + "language_loss": 0.81374025, + "learning_rate": 3.592104849622183e-06, + "loss": 0.83599818, + "num_input_tokens_seen": 40950435, + "step": 1919, + "time_per_iteration": 2.790379524230957 + }, + { + "auxiliary_loss_clip": 0.01118263, + "auxiliary_loss_mlp": 0.01089417, + "balance_loss_clip": 1.03290534, + "balance_loss_mlp": 1.00740063, + "epoch": 0.23086635002705466, + "flos": 28841798937600.0, + "grad_norm": 1.7133901716160693, + "language_loss": 0.73139054, + "learning_rate": 3.591633273675644e-06, + "loss": 0.75346732, + "num_input_tokens_seen": 40972670, + "step": 1920, + "time_per_iteration": 3.773115634918213 + }, + { + "auxiliary_loss_clip": 0.01093627, + "auxiliary_loss_mlp": 0.01081725, + "balance_loss_clip": 1.02258253, + "balance_loss_mlp": 1.00199783, + "epoch": 0.23098659291769374, + "flos": 62923681566720.0, + "grad_norm": 0.9122248205332585, + "language_loss": 0.58191991, + "learning_rate": 3.591161456279602e-06, + "loss": 0.60367346, + "num_input_tokens_seen": 41018215, + "step": 1921, + "time_per_iteration": 3.1289610862731934 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_clip": 1.0353353, + "balance_loss_mlp": 1.00813174, + "epoch": 0.23110683580833283, + "flos": 23476816679040.0, + "grad_norm": 1.5911307265893742, + "language_loss": 0.80181241, + "learning_rate": 3.590689397505633e-06, + "loss": 0.82412273, + "num_input_tokens_seen": 41039125, + "step": 1922, + "time_per_iteration": 2.7665414810180664 + }, + { + "auxiliary_loss_clip": 0.01160144, + "auxiliary_loss_mlp": 0.01089282, + "balance_loss_clip": 1.0389502, + "balance_loss_mlp": 1.00750422, + "epoch": 0.2312270786989719, + "flos": 27271066124160.0, + "grad_norm": 1.8058049541606527, + "language_loss": 0.86721814, + "learning_rate": 3.590217097425347e-06, + "loss": 0.88971239, + "num_input_tokens_seen": 41059025, + "step": 1923, + "time_per_iteration": 3.69224214553833 + }, + { + "auxiliary_loss_clip": 0.01160838, + "auxiliary_loss_mlp": 0.01090653, + "balance_loss_clip": 1.03902781, + "balance_loss_mlp": 1.00863731, + "epoch": 0.23134732158961102, + "flos": 13261344618240.0, + "grad_norm": 2.1274071435249433, + "language_loss": 0.71358943, + "learning_rate": 3.589744556110391e-06, + "loss": 0.73610437, + "num_input_tokens_seen": 41077015, + "step": 1924, + "time_per_iteration": 2.679471254348755 + }, + { + "auxiliary_loss_clip": 0.01141503, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_clip": 1.03567314, + "balance_loss_mlp": 1.00851929, + "epoch": 0.2314675644802501, + "flos": 36977648250240.0, + "grad_norm": 1.5179272814996077, + "language_loss": 0.84273291, + "learning_rate": 3.58927177363245e-06, + "loss": 0.86505425, + "num_input_tokens_seen": 41099840, + "step": 1925, + "time_per_iteration": 2.8939130306243896 + }, + { + "auxiliary_loss_clip": 0.01125168, + "auxiliary_loss_mlp": 0.01090288, + "balance_loss_clip": 1.03549671, + "balance_loss_mlp": 1.0080812, + "epoch": 0.2315878073708892, + "flos": 23842207779840.0, + "grad_norm": 2.4627756109500996, + "language_loss": 0.7267549, + "learning_rate": 3.5887987500632447e-06, + "loss": 0.74890947, + "num_input_tokens_seen": 41117845, + "step": 1926, + "time_per_iteration": 3.764409065246582 + }, + { + "auxiliary_loss_clip": 0.01133404, + "auxiliary_loss_mlp": 0.01090109, + "balance_loss_clip": 1.03693628, + "balance_loss_mlp": 1.00842679, + "epoch": 0.2317080502615283, + "flos": 23039424766080.0, + "grad_norm": 1.7035116943411739, + "language_loss": 0.84213424, + "learning_rate": 3.5883254854745325e-06, + "loss": 0.86436939, + "num_input_tokens_seen": 41136235, + "step": 1927, + "time_per_iteration": 2.805555582046509 + }, + { + "auxiliary_loss_clip": 0.0115225, + "auxiliary_loss_mlp": 0.01089348, + "balance_loss_clip": 1.03835618, + "balance_loss_mlp": 1.00728452, + "epoch": 0.23182829315216738, + "flos": 11254656435840.0, + "grad_norm": 1.9482773625974295, + "language_loss": 0.74920219, + "learning_rate": 3.587851979938107e-06, + "loss": 0.77161819, + "num_input_tokens_seen": 41153125, + "step": 1928, + "time_per_iteration": 3.6240921020507812 + }, + { + "auxiliary_loss_clip": 0.01149344, + "auxiliary_loss_mlp": 0.01090271, + "balance_loss_clip": 1.03665876, + "balance_loss_mlp": 1.00835085, + "epoch": 0.23194853604280646, + "flos": 19828939155840.0, + "grad_norm": 1.9004304353120243, + "language_loss": 0.77398825, + "learning_rate": 3.5873782335257985e-06, + "loss": 0.79638445, + "num_input_tokens_seen": 41171290, + "step": 1929, + "time_per_iteration": 2.7235426902770996 + }, + { + "auxiliary_loss_clip": 0.01120253, + "auxiliary_loss_mlp": 0.01089033, + "balance_loss_clip": 1.03379118, + "balance_loss_mlp": 1.00692141, + "epoch": 0.23206877893344555, + "flos": 15305020830720.0, + "grad_norm": 2.05396109238414, + "language_loss": 0.78394663, + "learning_rate": 3.5869042463094744e-06, + "loss": 0.80603945, + "num_input_tokens_seen": 41189005, + "step": 1930, + "time_per_iteration": 2.7533626556396484 + }, + { + "auxiliary_loss_clip": 0.01114245, + "auxiliary_loss_mlp": 0.01089914, + "balance_loss_clip": 1.03299129, + "balance_loss_mlp": 1.00789762, + "epoch": 0.23218902182408466, + "flos": 22711488572160.0, + "grad_norm": 1.8259179815737772, + "language_loss": 0.76919574, + "learning_rate": 3.586430018361038e-06, + "loss": 0.79123729, + "num_input_tokens_seen": 41208775, + "step": 1931, + "time_per_iteration": 2.895256757736206 + }, + { + "auxiliary_loss_clip": 0.01141211, + "auxiliary_loss_mlp": 0.01089396, + "balance_loss_clip": 1.03626418, + "balance_loss_mlp": 1.00728464, + "epoch": 0.23230926471472374, + "flos": 22710734386560.0, + "grad_norm": 1.8605016413631412, + "language_loss": 0.75730884, + "learning_rate": 3.5859555497524283e-06, + "loss": 0.77961493, + "num_input_tokens_seen": 41226010, + "step": 1932, + "time_per_iteration": 2.7437474727630615 + }, + { + "auxiliary_loss_clip": 0.011512, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_clip": 1.03838038, + "balance_loss_mlp": 1.00746524, + "epoch": 0.23242950760536282, + "flos": 20375499479040.0, + "grad_norm": 2.8502449784103923, + "language_loss": 0.92041433, + "learning_rate": 3.5854808405556237e-06, + "loss": 0.94281828, + "num_input_tokens_seen": 41245245, + "step": 1933, + "time_per_iteration": 2.707749843597412 + }, + { + "auxiliary_loss_clip": 0.01128565, + "auxiliary_loss_mlp": 0.0108932, + "balance_loss_clip": 1.03322053, + "balance_loss_mlp": 1.00749493, + "epoch": 0.23254975049600193, + "flos": 16908324301440.0, + "grad_norm": 2.799926106760316, + "language_loss": 0.74526572, + "learning_rate": 3.5850058908426355e-06, + "loss": 0.76744449, + "num_input_tokens_seen": 41263795, + "step": 1934, + "time_per_iteration": 2.826068639755249 + }, + { + "auxiliary_loss_clip": 0.01141575, + "auxiliary_loss_mlp": 0.01090077, + "balance_loss_clip": 1.03579879, + "balance_loss_mlp": 1.00825179, + "epoch": 0.23266999338664102, + "flos": 23294821443840.0, + "grad_norm": 1.6850882497348414, + "language_loss": 0.85300344, + "learning_rate": 3.584530700685514e-06, + "loss": 0.87532002, + "num_input_tokens_seen": 41284055, + "step": 1935, + "time_per_iteration": 2.743833065032959 + }, + { + "auxiliary_loss_clip": 0.01137072, + "auxiliary_loss_mlp": 0.01088673, + "balance_loss_clip": 1.0351541, + "balance_loss_mlp": 1.00675225, + "epoch": 0.2327902362772801, + "flos": 19569987031680.0, + "grad_norm": 2.0618641895001772, + "language_loss": 0.884516, + "learning_rate": 3.5840552701563448e-06, + "loss": 0.90677345, + "num_input_tokens_seen": 41300255, + "step": 1936, + "time_per_iteration": 2.764390468597412 + }, + { + "auxiliary_loss_clip": 0.01159445, + "auxiliary_loss_mlp": 0.01088176, + "balance_loss_clip": 1.03780508, + "balance_loss_mlp": 1.0063988, + "epoch": 0.2329104791679192, + "flos": 16727514215040.0, + "grad_norm": 2.119209167032379, + "language_loss": 0.8158769, + "learning_rate": 3.5835795993272513e-06, + "loss": 0.8383531, + "num_input_tokens_seen": 41318540, + "step": 1937, + "time_per_iteration": 2.821723699569702 + }, + { + "auxiliary_loss_clip": 0.01085934, + "auxiliary_loss_mlp": 0.01091633, + "balance_loss_clip": 1.03160429, + "balance_loss_mlp": 1.00980771, + "epoch": 0.2330307220585583, + "flos": 22163743100160.0, + "grad_norm": 2.399364483136531, + "language_loss": 0.71430278, + "learning_rate": 3.583103688270391e-06, + "loss": 0.7360785, + "num_input_tokens_seen": 41338320, + "step": 1938, + "time_per_iteration": 2.987051010131836 + }, + { + "auxiliary_loss_clip": 0.01143579, + "auxiliary_loss_mlp": 0.01089512, + "balance_loss_clip": 1.03800964, + "balance_loss_mlp": 1.00740075, + "epoch": 0.23315096494919738, + "flos": 19317319787520.0, + "grad_norm": 2.0303429328760996, + "language_loss": 0.89488888, + "learning_rate": 3.58262753705796e-06, + "loss": 0.91721976, + "num_input_tokens_seen": 41353210, + "step": 1939, + "time_per_iteration": 2.7768747806549072 + }, + { + "auxiliary_loss_clip": 0.01127774, + "auxiliary_loss_mlp": 0.01080005, + "balance_loss_clip": 1.03592491, + "balance_loss_mlp": 1.000278, + "epoch": 0.23327120783983646, + "flos": 53031048946560.0, + "grad_norm": 0.8321843043121666, + "language_loss": 0.55533445, + "learning_rate": 3.5821511457621902e-06, + "loss": 0.57741225, + "num_input_tokens_seen": 41410510, + "step": 1940, + "time_per_iteration": 3.3795149326324463 + }, + { + "auxiliary_loss_clip": 0.01138284, + "auxiliary_loss_mlp": 0.01090685, + "balance_loss_clip": 1.0351336, + "balance_loss_mlp": 1.00852537, + "epoch": 0.23339145073047557, + "flos": 17126984344320.0, + "grad_norm": 9.88344574907506, + "language_loss": 0.80612433, + "learning_rate": 3.5816745144553497e-06, + "loss": 0.82841396, + "num_input_tokens_seen": 41425830, + "step": 1941, + "time_per_iteration": 2.812556028366089 + }, + { + "auxiliary_loss_clip": 0.01112467, + "auxiliary_loss_mlp": 0.01088172, + "balance_loss_clip": 1.03006065, + "balance_loss_mlp": 1.00620389, + "epoch": 0.23351169362111465, + "flos": 13078918419840.0, + "grad_norm": 2.0455012387100586, + "language_loss": 0.75400287, + "learning_rate": 3.5811976432097424e-06, + "loss": 0.7760092, + "num_input_tokens_seen": 41443500, + "step": 1942, + "time_per_iteration": 2.8034005165100098 + }, + { + "auxiliary_loss_clip": 0.01150331, + "auxiliary_loss_mlp": 0.0087402, + "balance_loss_clip": 1.03818691, + "balance_loss_mlp": 0.99993807, + "epoch": 0.23363193651175373, + "flos": 15851257931520.0, + "grad_norm": 2.0405354699178746, + "language_loss": 0.8421827, + "learning_rate": 3.58072053209771e-06, + "loss": 0.86242622, + "num_input_tokens_seen": 41460055, + "step": 1943, + "time_per_iteration": 2.733518123626709 + }, + { + "auxiliary_loss_clip": 0.0114175, + "auxiliary_loss_mlp": 0.01090987, + "balance_loss_clip": 1.03621531, + "balance_loss_mlp": 1.00887537, + "epoch": 0.23375217940239285, + "flos": 21025769345280.0, + "grad_norm": 2.304744705764831, + "language_loss": 0.7904737, + "learning_rate": 3.5802431811916296e-06, + "loss": 0.81280106, + "num_input_tokens_seen": 41476665, + "step": 1944, + "time_per_iteration": 2.725179672241211 + }, + { + "auxiliary_loss_clip": 0.01136027, + "auxiliary_loss_mlp": 0.01088637, + "balance_loss_clip": 1.03386343, + "balance_loss_mlp": 1.00676453, + "epoch": 0.23387242229303193, + "flos": 20594698225920.0, + "grad_norm": 1.6490715566460168, + "language_loss": 0.80587661, + "learning_rate": 3.579765590563916e-06, + "loss": 0.82812327, + "num_input_tokens_seen": 41496065, + "step": 1945, + "time_per_iteration": 3.6707794666290283 + }, + { + "auxiliary_loss_clip": 0.01148531, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_clip": 1.03613353, + "balance_loss_mlp": 1.00634503, + "epoch": 0.233992665183671, + "flos": 24279491952000.0, + "grad_norm": 1.8306208456536708, + "language_loss": 0.82179117, + "learning_rate": 3.579287760287017e-06, + "loss": 0.84415913, + "num_input_tokens_seen": 41516815, + "step": 1946, + "time_per_iteration": 2.702812910079956 + }, + { + "auxiliary_loss_clip": 0.01150313, + "auxiliary_loss_mlp": 0.01089934, + "balance_loss_clip": 1.0385592, + "balance_loss_mlp": 1.00815606, + "epoch": 0.2341129080743101, + "flos": 30154621121280.0, + "grad_norm": 1.808166512405316, + "language_loss": 0.72591734, + "learning_rate": 3.578809690433421e-06, + "loss": 0.7483198, + "num_input_tokens_seen": 41538525, + "step": 1947, + "time_per_iteration": 2.780813694000244 + }, + { + "auxiliary_loss_clip": 0.01161967, + "auxiliary_loss_mlp": 0.01088422, + "balance_loss_clip": 1.04055727, + "balance_loss_mlp": 1.00635791, + "epoch": 0.2342331509649492, + "flos": 22784135829120.0, + "grad_norm": 2.967717528912398, + "language_loss": 0.8136642, + "learning_rate": 3.578331381075651e-06, + "loss": 0.83616805, + "num_input_tokens_seen": 41559025, + "step": 1948, + "time_per_iteration": 3.66050124168396 + }, + { + "auxiliary_loss_clip": 0.01151154, + "auxiliary_loss_mlp": 0.0108993, + "balance_loss_clip": 1.03759241, + "balance_loss_mlp": 1.00777102, + "epoch": 0.2343533938555883, + "flos": 23623152687360.0, + "grad_norm": 2.4415455530010393, + "language_loss": 0.69611335, + "learning_rate": 3.5778528322862646e-06, + "loss": 0.7185241, + "num_input_tokens_seen": 41577845, + "step": 1949, + "time_per_iteration": 2.7446587085723877 + }, + { + "auxiliary_loss_clip": 0.01144573, + "auxiliary_loss_mlp": 0.01088545, + "balance_loss_clip": 1.03829706, + "balance_loss_mlp": 1.00662398, + "epoch": 0.23447363674622737, + "flos": 24570332375040.0, + "grad_norm": 1.5558317929650705, + "language_loss": 0.86286169, + "learning_rate": 3.5773740441378585e-06, + "loss": 0.88519287, + "num_input_tokens_seen": 41598600, + "step": 1950, + "time_per_iteration": 2.797917366027832 + }, + { + "auxiliary_loss_clip": 0.01147253, + "auxiliary_loss_mlp": 0.01086872, + "balance_loss_clip": 1.03552699, + "balance_loss_mlp": 1.00504649, + "epoch": 0.23459387963686648, + "flos": 53140322119680.0, + "grad_norm": 1.7684778825716774, + "language_loss": 0.73816419, + "learning_rate": 3.5768950167030633e-06, + "loss": 0.76050544, + "num_input_tokens_seen": 41623300, + "step": 1951, + "time_per_iteration": 3.960767984390259 + }, + { + "auxiliary_loss_clip": 0.01140205, + "auxiliary_loss_mlp": 0.01090372, + "balance_loss_clip": 1.03646016, + "balance_loss_mlp": 1.0084517, + "epoch": 0.23471412252750556, + "flos": 23951412103680.0, + "grad_norm": 2.289580100475449, + "language_loss": 0.78587902, + "learning_rate": 3.576415750054548e-06, + "loss": 0.80818474, + "num_input_tokens_seen": 41643420, + "step": 1952, + "time_per_iteration": 2.757080078125 + }, + { + "auxiliary_loss_clip": 0.01138359, + "auxiliary_loss_mlp": 0.01089591, + "balance_loss_clip": 1.0344882, + "balance_loss_mlp": 1.0075748, + "epoch": 0.23483436541814465, + "flos": 15706573948800.0, + "grad_norm": 2.049940097245284, + "language_loss": 0.85517633, + "learning_rate": 3.5759362442650172e-06, + "loss": 0.87745583, + "num_input_tokens_seen": 41660170, + "step": 1953, + "time_per_iteration": 3.6131107807159424 + }, + { + "auxiliary_loss_clip": 0.01142872, + "auxiliary_loss_mlp": 0.01089562, + "balance_loss_clip": 1.03301418, + "balance_loss_mlp": 1.00764108, + "epoch": 0.23495460830878373, + "flos": 24936262179840.0, + "grad_norm": 2.1565561224531415, + "language_loss": 0.85135436, + "learning_rate": 3.5754564994072113e-06, + "loss": 0.87367868, + "num_input_tokens_seen": 41679010, + "step": 1954, + "time_per_iteration": 2.735025405883789 + }, + { + "auxiliary_loss_clip": 0.01141427, + "auxiliary_loss_mlp": 0.0108914, + "balance_loss_clip": 1.0363574, + "balance_loss_mlp": 1.00721955, + "epoch": 0.23507485119942284, + "flos": 30482665056000.0, + "grad_norm": 2.0072094898019324, + "language_loss": 0.5966047, + "learning_rate": 3.5749765155539067e-06, + "loss": 0.61891031, + "num_input_tokens_seen": 41699495, + "step": 1955, + "time_per_iteration": 2.76664137840271 + }, + { + "auxiliary_loss_clip": 0.01129559, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_clip": 1.03512311, + "balance_loss_mlp": 1.00633311, + "epoch": 0.23519509409006192, + "flos": 18329129746560.0, + "grad_norm": 1.9395943000730707, + "language_loss": 0.92308152, + "learning_rate": 3.574496292777917e-06, + "loss": 0.9452616, + "num_input_tokens_seen": 41717705, + "step": 1956, + "time_per_iteration": 2.801938533782959 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.01090422, + "balance_loss_clip": 1.03513741, + "balance_loss_mlp": 1.00802445, + "epoch": 0.235315336980701, + "flos": 29643217234560.0, + "grad_norm": 2.1255081078369242, + "language_loss": 0.71428835, + "learning_rate": 3.574015831152092e-06, + "loss": 0.73658252, + "num_input_tokens_seen": 41738120, + "step": 1957, + "time_per_iteration": 2.8021488189697266 + }, + { + "auxiliary_loss_clip": 0.01136242, + "auxiliary_loss_mlp": 0.01089199, + "balance_loss_clip": 1.03556883, + "balance_loss_mlp": 1.00723028, + "epoch": 0.23543557987134012, + "flos": 18551704371840.0, + "grad_norm": 2.0791484707672185, + "language_loss": 0.83400142, + "learning_rate": 3.573535130749316e-06, + "loss": 0.85625577, + "num_input_tokens_seen": 41756070, + "step": 1958, + "time_per_iteration": 2.716547966003418 + }, + { + "auxiliary_loss_clip": 0.01137722, + "auxiliary_loss_mlp": 0.01088608, + "balance_loss_clip": 1.03646278, + "balance_loss_mlp": 1.00683045, + "epoch": 0.2355558227619792, + "flos": 24679033908480.0, + "grad_norm": 1.6626051384401042, + "language_loss": 0.7376622, + "learning_rate": 3.5730541916425127e-06, + "loss": 0.75992548, + "num_input_tokens_seen": 41777550, + "step": 1959, + "time_per_iteration": 2.844752550125122 + }, + { + "auxiliary_loss_clip": 0.01129431, + "auxiliary_loss_mlp": 0.01088718, + "balance_loss_clip": 1.0344193, + "balance_loss_mlp": 1.00694084, + "epoch": 0.23567606565261828, + "flos": 21944795748480.0, + "grad_norm": 1.8408734627500078, + "language_loss": 0.85946983, + "learning_rate": 3.572573013904639e-06, + "loss": 0.88165128, + "num_input_tokens_seen": 41797460, + "step": 1960, + "time_per_iteration": 2.782593011856079 + }, + { + "auxiliary_loss_clip": 0.01158679, + "auxiliary_loss_mlp": 0.01089527, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.00755835, + "epoch": 0.2357963085432574, + "flos": 13589352639360.0, + "grad_norm": 1.8087210251216628, + "language_loss": 0.92019892, + "learning_rate": 3.572091597608689e-06, + "loss": 0.94268095, + "num_input_tokens_seen": 41815585, + "step": 1961, + "time_per_iteration": 2.7647366523742676 + }, + { + "auxiliary_loss_clip": 0.01138944, + "auxiliary_loss_mlp": 0.01090135, + "balance_loss_clip": 1.03616941, + "balance_loss_mlp": 1.00816655, + "epoch": 0.23591655143389648, + "flos": 22088689632000.0, + "grad_norm": 2.128945093565922, + "language_loss": 0.73054385, + "learning_rate": 3.571609942827694e-06, + "loss": 0.75283462, + "num_input_tokens_seen": 41834700, + "step": 1962, + "time_per_iteration": 2.7083985805511475 + }, + { + "auxiliary_loss_clip": 0.01138008, + "auxiliary_loss_mlp": 0.01090775, + "balance_loss_clip": 1.03493762, + "balance_loss_mlp": 1.00904477, + "epoch": 0.23603679432453556, + "flos": 17017349057280.0, + "grad_norm": 1.6449624586620213, + "language_loss": 0.88555437, + "learning_rate": 3.57112804963472e-06, + "loss": 0.90784222, + "num_input_tokens_seen": 41852915, + "step": 1963, + "time_per_iteration": 2.785682439804077 + }, + { + "auxiliary_loss_clip": 0.01126658, + "auxiliary_loss_mlp": 0.01088975, + "balance_loss_clip": 1.03511524, + "balance_loss_mlp": 1.0072453, + "epoch": 0.23615703721517464, + "flos": 19171307001600.0, + "grad_norm": 1.7593283819127767, + "language_loss": 0.76200795, + "learning_rate": 3.57064591810287e-06, + "loss": 0.78416431, + "num_input_tokens_seen": 41870415, + "step": 1964, + "time_per_iteration": 2.774378776550293 + }, + { + "auxiliary_loss_clip": 0.01159743, + "auxiliary_loss_mlp": 0.00873866, + "balance_loss_clip": 1.03871465, + "balance_loss_mlp": 0.99996567, + "epoch": 0.23627728010581375, + "flos": 19098803399040.0, + "grad_norm": 2.0086102698450485, + "language_loss": 0.80682176, + "learning_rate": 3.570163548305284e-06, + "loss": 0.82715786, + "num_input_tokens_seen": 41889345, + "step": 1965, + "time_per_iteration": 2.6801552772521973 + }, + { + "auxiliary_loss_clip": 0.01135876, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_clip": 1.03336394, + "balance_loss_mlp": 1.00652909, + "epoch": 0.23639752299645284, + "flos": 14282213057280.0, + "grad_norm": 2.027584707636934, + "language_loss": 0.69922853, + "learning_rate": 3.569680940315135e-06, + "loss": 0.72147226, + "num_input_tokens_seen": 41905745, + "step": 1966, + "time_per_iteration": 2.707742214202881 + }, + { + "auxiliary_loss_clip": 0.01130466, + "auxiliary_loss_mlp": 0.01088661, + "balance_loss_clip": 1.03535998, + "balance_loss_mlp": 1.00640678, + "epoch": 0.23651776588709192, + "flos": 22893411980160.0, + "grad_norm": 1.7546220936652763, + "language_loss": 0.82203764, + "learning_rate": 3.5691980942056356e-06, + "loss": 0.84422892, + "num_input_tokens_seen": 41925115, + "step": 1967, + "time_per_iteration": 2.814364194869995 + }, + { + "auxiliary_loss_clip": 0.01150701, + "auxiliary_loss_mlp": 0.010885, + "balance_loss_clip": 1.03722274, + "balance_loss_mlp": 1.00662756, + "epoch": 0.23663800877773103, + "flos": 18624531196800.0, + "grad_norm": 1.836179868355636, + "language_loss": 0.79775178, + "learning_rate": 3.5687150100500332e-06, + "loss": 0.82014382, + "num_input_tokens_seen": 41944815, + "step": 1968, + "time_per_iteration": 2.671853542327881 + }, + { + "auxiliary_loss_clip": 0.01149588, + "auxiliary_loss_mlp": 0.0108895, + "balance_loss_clip": 1.03689432, + "balance_loss_mlp": 1.00707686, + "epoch": 0.2367582516683701, + "flos": 25555828896000.0, + "grad_norm": 1.9250184909878862, + "language_loss": 0.7415418, + "learning_rate": 3.568231687921611e-06, + "loss": 0.76392716, + "num_input_tokens_seen": 41964990, + "step": 1969, + "time_per_iteration": 2.7785301208496094 + }, + { + "auxiliary_loss_clip": 0.01158814, + "auxiliary_loss_mlp": 0.0108961, + "balance_loss_clip": 1.03749824, + "balance_loss_mlp": 1.00783193, + "epoch": 0.2368784945590092, + "flos": 23295072839040.0, + "grad_norm": 1.5155875879031315, + "language_loss": 0.80557871, + "learning_rate": 3.5677481278936883e-06, + "loss": 0.82806301, + "num_input_tokens_seen": 41984570, + "step": 1970, + "time_per_iteration": 3.6564865112304688 + }, + { + "auxiliary_loss_clip": 0.0112368, + "auxiliary_loss_mlp": 0.01079994, + "balance_loss_clip": 1.03194642, + "balance_loss_mlp": 1.00026667, + "epoch": 0.23699873744964828, + "flos": 69859291875840.0, + "grad_norm": 0.828152772191864, + "language_loss": 0.5785445, + "learning_rate": 3.5672643300396214e-06, + "loss": 0.60058129, + "num_input_tokens_seen": 42053715, + "step": 1971, + "time_per_iteration": 3.3452136516571045 + }, + { + "auxiliary_loss_clip": 0.01128705, + "auxiliary_loss_mlp": 0.0108963, + "balance_loss_clip": 1.03450847, + "balance_loss_mlp": 1.00790048, + "epoch": 0.2371189803402874, + "flos": 21835052720640.0, + "grad_norm": 1.9117022286993413, + "language_loss": 0.67677355, + "learning_rate": 3.566780294432802e-06, + "loss": 0.69895685, + "num_input_tokens_seen": 42070890, + "step": 1972, + "time_per_iteration": 2.841747283935547 + }, + { + "auxiliary_loss_clip": 0.01159045, + "auxiliary_loss_mlp": 0.01088717, + "balance_loss_clip": 1.03832877, + "balance_loss_mlp": 1.00693965, + "epoch": 0.23723922323092647, + "flos": 21908490076800.0, + "grad_norm": 2.1740733818773146, + "language_loss": 0.74778926, + "learning_rate": 3.566296021146657e-06, + "loss": 0.77026695, + "num_input_tokens_seen": 42090270, + "step": 1973, + "time_per_iteration": 3.657330274581909 + }, + { + "auxiliary_loss_clip": 0.01159394, + "auxiliary_loss_mlp": 0.01088004, + "balance_loss_clip": 1.03803444, + "balance_loss_mlp": 1.00613081, + "epoch": 0.23735946612156555, + "flos": 32708803380480.0, + "grad_norm": 1.562831164929452, + "language_loss": 0.7308079, + "learning_rate": 3.565811510254652e-06, + "loss": 0.75328195, + "num_input_tokens_seen": 42111150, + "step": 1974, + "time_per_iteration": 2.7290918827056885 + }, + { + "auxiliary_loss_clip": 0.0112404, + "auxiliary_loss_mlp": 0.01080669, + "balance_loss_clip": 1.0252347, + "balance_loss_mlp": 1.00094187, + "epoch": 0.23747970901220466, + "flos": 70546944821760.0, + "grad_norm": 0.8337575137970629, + "language_loss": 0.58298761, + "learning_rate": 3.5653267618302845e-06, + "loss": 0.60503465, + "num_input_tokens_seen": 42178730, + "step": 1975, + "time_per_iteration": 3.300309181213379 + }, + { + "auxiliary_loss_clip": 0.0115962, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_clip": 1.03911734, + "balance_loss_mlp": 1.00582767, + "epoch": 0.23759995190284375, + "flos": 20849807594880.0, + "grad_norm": 1.7558091065770896, + "language_loss": 0.86019468, + "learning_rate": 3.564841775947093e-06, + "loss": 0.88266885, + "num_input_tokens_seen": 42199620, + "step": 1976, + "time_per_iteration": 2.644585371017456 + }, + { + "auxiliary_loss_clip": 0.01132358, + "auxiliary_loss_mlp": 0.01092631, + "balance_loss_clip": 1.03538382, + "balance_loss_mlp": 1.01066267, + "epoch": 0.23772019479348283, + "flos": 32921645420160.0, + "grad_norm": 2.220822078550102, + "language_loss": 0.75849342, + "learning_rate": 3.5643565526786475e-06, + "loss": 0.78074336, + "num_input_tokens_seen": 42219560, + "step": 1977, + "time_per_iteration": 3.8045687675476074 + }, + { + "auxiliary_loss_clip": 0.0115886, + "auxiliary_loss_mlp": 0.01088182, + "balance_loss_clip": 1.03780246, + "balance_loss_mlp": 1.00630927, + "epoch": 0.2378404376841219, + "flos": 32342765834880.0, + "grad_norm": 1.7220729035443512, + "language_loss": 0.77305067, + "learning_rate": 3.5638710920985574e-06, + "loss": 0.79552108, + "num_input_tokens_seen": 42241020, + "step": 1978, + "time_per_iteration": 3.635270595550537 + }, + { + "auxiliary_loss_clip": 0.01147896, + "auxiliary_loss_mlp": 0.00874002, + "balance_loss_clip": 1.03443968, + "balance_loss_mlp": 0.99986744, + "epoch": 0.23796068057476102, + "flos": 22997624313600.0, + "grad_norm": 1.917067371990056, + "language_loss": 0.82203907, + "learning_rate": 3.5633853942804655e-06, + "loss": 0.84225804, + "num_input_tokens_seen": 42259345, + "step": 1979, + "time_per_iteration": 2.698744058609009 + }, + { + "auxiliary_loss_clip": 0.01130353, + "auxiliary_loss_mlp": 0.0109034, + "balance_loss_clip": 1.03366005, + "balance_loss_mlp": 1.00827646, + "epoch": 0.2380809234654001, + "flos": 13480938414720.0, + "grad_norm": 2.1923430601573517, + "language_loss": 0.76482248, + "learning_rate": 3.5628994592980527e-06, + "loss": 0.78702945, + "num_input_tokens_seen": 42277250, + "step": 1980, + "time_per_iteration": 2.7687602043151855 + }, + { + "auxiliary_loss_clip": 0.01158672, + "auxiliary_loss_mlp": 0.01088951, + "balance_loss_clip": 1.03731275, + "balance_loss_mlp": 1.00703025, + "epoch": 0.2382011663560392, + "flos": 16871803148160.0, + "grad_norm": 1.6495513278604097, + "language_loss": 0.70392126, + "learning_rate": 3.562413287225034e-06, + "loss": 0.72639751, + "num_input_tokens_seen": 42295360, + "step": 1981, + "time_per_iteration": 2.6307547092437744 + }, + { + "auxiliary_loss_clip": 0.01151265, + "auxiliary_loss_mlp": 0.01090738, + "balance_loss_clip": 1.03832531, + "balance_loss_mlp": 1.00876927, + "epoch": 0.2383214092466783, + "flos": 18441135331200.0, + "grad_norm": 2.3002181107583413, + "language_loss": 0.89380074, + "learning_rate": 3.5619268781351623e-06, + "loss": 0.91622078, + "num_input_tokens_seen": 42313430, + "step": 1982, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.01132294, + "auxiliary_loss_mlp": 0.01091447, + "balance_loss_clip": 1.03393507, + "balance_loss_mlp": 1.00976527, + "epoch": 0.23844165213731738, + "flos": 19755717281280.0, + "grad_norm": 1.745167911131081, + "language_loss": 0.76744258, + "learning_rate": 3.5614402321022256e-06, + "loss": 0.78968, + "num_input_tokens_seen": 42331260, + "step": 1983, + "time_per_iteration": 2.738431692123413 + }, + { + "auxiliary_loss_clip": 0.01117361, + "auxiliary_loss_mlp": 0.01088399, + "balance_loss_clip": 1.03397322, + "balance_loss_mlp": 1.00666881, + "epoch": 0.23856189502795647, + "flos": 23367360960000.0, + "grad_norm": 2.075473723671637, + "language_loss": 0.87315279, + "learning_rate": 3.5609533492000463e-06, + "loss": 0.89521039, + "num_input_tokens_seen": 42350150, + "step": 1984, + "time_per_iteration": 2.8304591178894043 + }, + { + "auxiliary_loss_clip": 0.01136482, + "auxiliary_loss_mlp": 0.01089275, + "balance_loss_clip": 1.03574121, + "balance_loss_mlp": 1.00745022, + "epoch": 0.23868213791859555, + "flos": 23475056912640.0, + "grad_norm": 2.144311167259109, + "language_loss": 0.7841289, + "learning_rate": 3.560466229502485e-06, + "loss": 0.80638653, + "num_input_tokens_seen": 42369495, + "step": 1985, + "time_per_iteration": 2.76664137840271 + }, + { + "auxiliary_loss_clip": 0.0112919, + "auxiliary_loss_mlp": 0.00873987, + "balance_loss_clip": 1.03389645, + "balance_loss_mlp": 0.99993658, + "epoch": 0.23880238080923466, + "flos": 16617340224000.0, + "grad_norm": 2.7214081844754174, + "language_loss": 0.90125442, + "learning_rate": 3.5599788730834384e-06, + "loss": 0.92128623, + "num_input_tokens_seen": 42387455, + "step": 1986, + "time_per_iteration": 2.7241647243499756 + }, + { + "auxiliary_loss_clip": 0.0114927, + "auxiliary_loss_mlp": 0.01089585, + "balance_loss_clip": 1.03650486, + "balance_loss_mlp": 1.0077126, + "epoch": 0.23892262369987374, + "flos": 17348409734400.0, + "grad_norm": 2.5121059415865363, + "language_loss": 0.78460133, + "learning_rate": 3.559491280016836e-06, + "loss": 0.80698991, + "num_input_tokens_seen": 42405400, + "step": 1987, + "time_per_iteration": 2.709592580795288 + }, + { + "auxiliary_loss_clip": 0.01135512, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_clip": 1.03306317, + "balance_loss_mlp": 1.00659037, + "epoch": 0.23904286659051283, + "flos": 22309899540480.0, + "grad_norm": 1.7078867399713924, + "language_loss": 0.71121967, + "learning_rate": 3.5590034503766465e-06, + "loss": 0.73345983, + "num_input_tokens_seen": 42425065, + "step": 1988, + "time_per_iteration": 2.704395055770874 + }, + { + "auxiliary_loss_clip": 0.0115933, + "auxiliary_loss_mlp": 0.01091152, + "balance_loss_clip": 1.03850293, + "balance_loss_mlp": 1.00927913, + "epoch": 0.23916310948115194, + "flos": 21178246579200.0, + "grad_norm": 3.2860108894088658, + "language_loss": 0.80924058, + "learning_rate": 3.558515384236874e-06, + "loss": 0.83174545, + "num_input_tokens_seen": 42442495, + "step": 1989, + "time_per_iteration": 2.714933395385742 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.00874007, + "balance_loss_clip": 1.03217685, + "balance_loss_mlp": 0.99995989, + "epoch": 0.23928335237179102, + "flos": 14137349506560.0, + "grad_norm": 1.655506712517902, + "language_loss": 0.84005463, + "learning_rate": 3.558027081671556e-06, + "loss": 0.85999334, + "num_input_tokens_seen": 42459480, + "step": 1990, + "time_per_iteration": 2.7807137966156006 + }, + { + "auxiliary_loss_clip": 0.011512, + "auxiliary_loss_mlp": 0.01089553, + "balance_loss_clip": 1.03786016, + "balance_loss_mlp": 1.00753748, + "epoch": 0.2394035952624301, + "flos": 23769596436480.0, + "grad_norm": 2.8564962454872878, + "language_loss": 0.68851662, + "learning_rate": 3.557538542754769e-06, + "loss": 0.71092415, + "num_input_tokens_seen": 42479175, + "step": 1991, + "time_per_iteration": 2.7827162742614746 + }, + { + "auxiliary_loss_clip": 0.01158565, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_clip": 1.03805637, + "balance_loss_mlp": 1.00795174, + "epoch": 0.2395238381530692, + "flos": 24206198250240.0, + "grad_norm": 1.681074677351511, + "language_loss": 0.66733044, + "learning_rate": 3.557049767560623e-06, + "loss": 0.68981385, + "num_input_tokens_seen": 42498090, + "step": 1992, + "time_per_iteration": 2.7670235633850098 + }, + { + "auxiliary_loss_clip": 0.01119192, + "auxiliary_loss_mlp": 0.01087965, + "balance_loss_clip": 1.03450871, + "balance_loss_mlp": 1.00609231, + "epoch": 0.2396440810437083, + "flos": 25295763450240.0, + "grad_norm": 1.9946079944155277, + "language_loss": 0.86046499, + "learning_rate": 3.5565607561632655e-06, + "loss": 0.88253665, + "num_input_tokens_seen": 42516930, + "step": 1993, + "time_per_iteration": 2.9638614654541016 + }, + { + "auxiliary_loss_clip": 0.0113334, + "auxiliary_loss_mlp": 0.010873, + "balance_loss_clip": 1.03435612, + "balance_loss_mlp": 1.00537896, + "epoch": 0.23976432393434738, + "flos": 28543093436160.0, + "grad_norm": 3.096271654168901, + "language_loss": 0.79614484, + "learning_rate": 3.5560715086368787e-06, + "loss": 0.81835127, + "num_input_tokens_seen": 42534800, + "step": 1994, + "time_per_iteration": 2.8099465370178223 + }, + { + "auxiliary_loss_clip": 0.01132599, + "auxiliary_loss_mlp": 0.01087466, + "balance_loss_clip": 1.03395128, + "balance_loss_mlp": 1.00568795, + "epoch": 0.23988456682498646, + "flos": 19494358945920.0, + "grad_norm": 1.9253944515147015, + "language_loss": 0.82274365, + "learning_rate": 3.5555820250556816e-06, + "loss": 0.84494424, + "num_input_tokens_seen": 42552000, + "step": 1995, + "time_per_iteration": 2.750185012817383 + }, + { + "auxiliary_loss_clip": 0.01132679, + "auxiliary_loss_mlp": 0.01089344, + "balance_loss_clip": 1.03722739, + "balance_loss_mlp": 1.00742316, + "epoch": 0.24000480971562557, + "flos": 20266331068800.0, + "grad_norm": 2.5703200263453203, + "language_loss": 0.69745111, + "learning_rate": 3.5550923054939278e-06, + "loss": 0.71967137, + "num_input_tokens_seen": 42571455, + "step": 1996, + "time_per_iteration": 3.714486837387085 + }, + { + "auxiliary_loss_clip": 0.01107643, + "auxiliary_loss_mlp": 0.01088508, + "balance_loss_clip": 1.03041923, + "balance_loss_mlp": 1.00692117, + "epoch": 0.24012505260626466, + "flos": 25443176866560.0, + "grad_norm": 2.0842468109981045, + "language_loss": 0.74614215, + "learning_rate": 3.5546023500259083e-06, + "loss": 0.7681036, + "num_input_tokens_seen": 42592550, + "step": 1997, + "time_per_iteration": 2.9089858531951904 + }, + { + "auxiliary_loss_clip": 0.01121746, + "auxiliary_loss_mlp": 0.01087926, + "balance_loss_clip": 1.03470469, + "balance_loss_mlp": 1.0061965, + "epoch": 0.24024529549690374, + "flos": 15553342529280.0, + "grad_norm": 1.875791722485031, + "language_loss": 0.80446726, + "learning_rate": 3.5541121587259477e-06, + "loss": 0.82656395, + "num_input_tokens_seen": 42610385, + "step": 1998, + "time_per_iteration": 2.8308606147766113 + }, + { + "auxiliary_loss_clip": 0.0112524, + "auxiliary_loss_mlp": 0.01080693, + "balance_loss_clip": 1.03653491, + "balance_loss_mlp": 1.00096595, + "epoch": 0.24036553838754285, + "flos": 57122351867520.0, + "grad_norm": 0.8348953103122653, + "language_loss": 0.57874262, + "learning_rate": 3.553621731668408e-06, + "loss": 0.60080194, + "num_input_tokens_seen": 42673595, + "step": 1999, + "time_per_iteration": 4.282290935516357 + }, + { + "auxiliary_loss_clip": 0.0114944, + "auxiliary_loss_mlp": 0.01088742, + "balance_loss_clip": 1.03652692, + "balance_loss_mlp": 1.00677419, + "epoch": 0.24048578127818193, + "flos": 24969946158720.0, + "grad_norm": 1.7555193491308758, + "language_loss": 0.83073819, + "learning_rate": 3.553131068927688e-06, + "loss": 0.85311997, + "num_input_tokens_seen": 42692000, + "step": 2000, + "time_per_iteration": 2.766360282897949 + }, + { + "auxiliary_loss_clip": 0.01127766, + "auxiliary_loss_mlp": 0.01088118, + "balance_loss_clip": 1.03520727, + "balance_loss_mlp": 1.00643599, + "epoch": 0.24060602416882101, + "flos": 23330947547520.0, + "grad_norm": 2.0152642250244344, + "language_loss": 0.808707, + "learning_rate": 3.552640170578219e-06, + "loss": 0.83086586, + "num_input_tokens_seen": 42712250, + "step": 2001, + "time_per_iteration": 2.7801930904388428 + }, + { + "auxiliary_loss_clip": 0.0113134, + "auxiliary_loss_mlp": 0.01088876, + "balance_loss_clip": 1.03398776, + "balance_loss_mlp": 1.00719333, + "epoch": 0.2407262670594601, + "flos": 14173260128640.0, + "grad_norm": 2.1702118437850615, + "language_loss": 0.78432679, + "learning_rate": 3.5521490366944703e-06, + "loss": 0.80652893, + "num_input_tokens_seen": 42729900, + "step": 2002, + "time_per_iteration": 3.6476025581359863 + }, + { + "auxiliary_loss_clip": 0.01127598, + "auxiliary_loss_mlp": 0.01090474, + "balance_loss_clip": 1.03410912, + "balance_loss_mlp": 1.00874424, + "epoch": 0.2408465099500992, + "flos": 13663113217920.0, + "grad_norm": 2.1219183918206186, + "language_loss": 0.79710913, + "learning_rate": 3.5516576673509474e-06, + "loss": 0.8192898, + "num_input_tokens_seen": 42747900, + "step": 2003, + "time_per_iteration": 2.7797253131866455 + }, + { + "auxiliary_loss_clip": 0.01158382, + "auxiliary_loss_mlp": 0.01090636, + "balance_loss_clip": 1.03776956, + "balance_loss_mlp": 1.00876331, + "epoch": 0.2409667528407383, + "flos": 31248029076480.0, + "grad_norm": 1.808131100956257, + "language_loss": 0.86141729, + "learning_rate": 3.5511660626221896e-06, + "loss": 0.8839075, + "num_input_tokens_seen": 42768540, + "step": 2004, + "time_per_iteration": 3.7217767238616943 + }, + { + "auxiliary_loss_clip": 0.01134701, + "auxiliary_loss_mlp": 0.00873949, + "balance_loss_clip": 1.03312457, + "balance_loss_mlp": 0.99993384, + "epoch": 0.24108699573137737, + "flos": 22199941031040.0, + "grad_norm": 2.10516081521605, + "language_loss": 0.89411259, + "learning_rate": 3.5506742225827744e-06, + "loss": 0.91419911, + "num_input_tokens_seen": 42785395, + "step": 2005, + "time_per_iteration": 2.7610960006713867 + }, + { + "auxiliary_loss_clip": 0.01127278, + "auxiliary_loss_mlp": 0.01087153, + "balance_loss_clip": 1.03453147, + "balance_loss_mlp": 1.00547075, + "epoch": 0.24120723862201648, + "flos": 26103035664000.0, + "grad_norm": 1.9932695611187912, + "language_loss": 0.90274298, + "learning_rate": 3.5501821473073116e-06, + "loss": 0.9248873, + "num_input_tokens_seen": 42801980, + "step": 2006, + "time_per_iteration": 2.77410626411438 + }, + { + "auxiliary_loss_clip": 0.01120377, + "auxiliary_loss_mlp": 0.01087712, + "balance_loss_clip": 1.02872372, + "balance_loss_mlp": 1.00602961, + "epoch": 0.24132748151265557, + "flos": 18624926246400.0, + "grad_norm": 1.9719013319578973, + "language_loss": 0.8660081, + "learning_rate": 3.54968983687045e-06, + "loss": 0.888089, + "num_input_tokens_seen": 42818850, + "step": 2007, + "time_per_iteration": 2.8365731239318848 + }, + { + "auxiliary_loss_clip": 0.01133211, + "auxiliary_loss_mlp": 0.01089746, + "balance_loss_clip": 1.03138208, + "balance_loss_mlp": 1.00787282, + "epoch": 0.24144772440329465, + "flos": 15267673664640.0, + "grad_norm": 2.3953144319424844, + "language_loss": 0.89063287, + "learning_rate": 3.549197291346872e-06, + "loss": 0.91286242, + "num_input_tokens_seen": 42835375, + "step": 2008, + "time_per_iteration": 2.6815271377563477 + }, + { + "auxiliary_loss_clip": 0.01151484, + "auxiliary_loss_mlp": 0.01089312, + "balance_loss_clip": 1.03918338, + "balance_loss_mlp": 1.00748682, + "epoch": 0.24156796729393373, + "flos": 24024274842240.0, + "grad_norm": 2.527464090447806, + "language_loss": 0.79455, + "learning_rate": 3.548704510811297e-06, + "loss": 0.81695795, + "num_input_tokens_seen": 42854570, + "step": 2009, + "time_per_iteration": 2.737199306488037 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.01088961, + "balance_loss_clip": 1.03329575, + "balance_loss_mlp": 1.00699258, + "epoch": 0.24168821018457284, + "flos": 26286790665600.0, + "grad_norm": 2.14319857584711, + "language_loss": 0.74436867, + "learning_rate": 3.5482114953384787e-06, + "loss": 0.76634824, + "num_input_tokens_seen": 42873800, + "step": 2010, + "time_per_iteration": 2.9062845706939697 + }, + { + "auxiliary_loss_clip": 0.01149673, + "auxiliary_loss_mlp": 0.01088148, + "balance_loss_clip": 1.0374161, + "balance_loss_mlp": 1.00651312, + "epoch": 0.24180845307521193, + "flos": 18223193560320.0, + "grad_norm": 2.185239164836191, + "language_loss": 0.84512675, + "learning_rate": 3.5477182450032077e-06, + "loss": 0.86750495, + "num_input_tokens_seen": 42892400, + "step": 2011, + "time_per_iteration": 2.78218412399292 + }, + { + "auxiliary_loss_clip": 0.0114763, + "auxiliary_loss_mlp": 0.01088603, + "balance_loss_clip": 1.03629839, + "balance_loss_mlp": 1.00677824, + "epoch": 0.241928695965851, + "flos": 20449260057600.0, + "grad_norm": 2.4053308329327705, + "language_loss": 0.8375982, + "learning_rate": 3.5472247598803097e-06, + "loss": 0.85996044, + "num_input_tokens_seen": 42911745, + "step": 2012, + "time_per_iteration": 2.769178867340088 + }, + { + "auxiliary_loss_clip": 0.01157233, + "auxiliary_loss_mlp": 0.01089229, + "balance_loss_clip": 1.03645492, + "balance_loss_mlp": 1.00730872, + "epoch": 0.24204893885649012, + "flos": 25556475340800.0, + "grad_norm": 2.196833407165181, + "language_loss": 0.85399628, + "learning_rate": 3.546731040044645e-06, + "loss": 0.87646091, + "num_input_tokens_seen": 42926915, + "step": 2013, + "time_per_iteration": 2.6690492630004883 + }, + { + "auxiliary_loss_clip": 0.0115829, + "auxiliary_loss_mlp": 0.0108916, + "balance_loss_clip": 1.03776538, + "balance_loss_mlp": 1.00747776, + "epoch": 0.2421691817471292, + "flos": 30660207004800.0, + "grad_norm": 1.7274589374957092, + "language_loss": 0.75418317, + "learning_rate": 3.546237085571112e-06, + "loss": 0.7766577, + "num_input_tokens_seen": 42945350, + "step": 2014, + "time_per_iteration": 2.791658401489258 + }, + { + "auxiliary_loss_clip": 0.01148813, + "auxiliary_loss_mlp": 0.0108844, + "balance_loss_clip": 1.03698409, + "balance_loss_mlp": 1.00661445, + "epoch": 0.24228942463776829, + "flos": 21945011230080.0, + "grad_norm": 2.3669177500948986, + "language_loss": 0.72328794, + "learning_rate": 3.5457428965346425e-06, + "loss": 0.74566054, + "num_input_tokens_seen": 42964290, + "step": 2015, + "time_per_iteration": 2.7176458835601807 + }, + { + "auxiliary_loss_clip": 0.01109952, + "auxiliary_loss_mlp": 0.01088213, + "balance_loss_clip": 1.03462732, + "balance_loss_mlp": 1.00643563, + "epoch": 0.2424096675284074, + "flos": 33984493879680.0, + "grad_norm": 1.6418502021391213, + "language_loss": 0.74455976, + "learning_rate": 3.545248473010205e-06, + "loss": 0.76654148, + "num_input_tokens_seen": 42987095, + "step": 2016, + "time_per_iteration": 2.938028335571289 + }, + { + "auxiliary_loss_clip": 0.01158366, + "auxiliary_loss_mlp": 0.00874002, + "balance_loss_clip": 1.03729534, + "balance_loss_mlp": 0.99994111, + "epoch": 0.24252991041904648, + "flos": 21653416621440.0, + "grad_norm": 1.7747947494785876, + "language_loss": 0.87487853, + "learning_rate": 3.544753815072802e-06, + "loss": 0.89520216, + "num_input_tokens_seen": 43005750, + "step": 2017, + "time_per_iteration": 2.6769325733184814 + }, + { + "auxiliary_loss_clip": 0.01093632, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_clip": 1.02853405, + "balance_loss_mlp": 1.00673103, + "epoch": 0.24265015330968556, + "flos": 21870065502720.0, + "grad_norm": 1.938987439200498, + "language_loss": 0.88043517, + "learning_rate": 3.544258922797474e-06, + "loss": 0.90225661, + "num_input_tokens_seen": 43023870, + "step": 2018, + "time_per_iteration": 2.9683048725128174 + }, + { + "auxiliary_loss_clip": 0.01159549, + "auxiliary_loss_mlp": 0.0108938, + "balance_loss_clip": 1.03841329, + "balance_loss_mlp": 1.00774586, + "epoch": 0.24277039620032465, + "flos": 25628260671360.0, + "grad_norm": 1.5305259744995425, + "language_loss": 0.7811048, + "learning_rate": 3.543763796259295e-06, + "loss": 0.80359411, + "num_input_tokens_seen": 43043825, + "step": 2019, + "time_per_iteration": 2.7183244228363037 + }, + { + "auxiliary_loss_clip": 0.01146964, + "auxiliary_loss_mlp": 0.01087577, + "balance_loss_clip": 1.03582144, + "balance_loss_mlp": 1.00579882, + "epoch": 0.24289063909096376, + "flos": 26286575184000.0, + "grad_norm": 2.2931175310375043, + "language_loss": 0.91030383, + "learning_rate": 3.5432684355333754e-06, + "loss": 0.93264914, + "num_input_tokens_seen": 43062480, + "step": 2020, + "time_per_iteration": 2.925036668777466 + }, + { + "auxiliary_loss_clip": 0.01148776, + "auxiliary_loss_mlp": 0.01089152, + "balance_loss_clip": 1.03627551, + "balance_loss_mlp": 1.00747013, + "epoch": 0.24301088198160284, + "flos": 25075056332160.0, + "grad_norm": 2.6748127264856953, + "language_loss": 0.7658428, + "learning_rate": 3.5427728406948613e-06, + "loss": 0.78822207, + "num_input_tokens_seen": 43081595, + "step": 2021, + "time_per_iteration": 3.720712423324585 + }, + { + "auxiliary_loss_clip": 0.01132549, + "auxiliary_loss_mlp": 0.01081259, + "balance_loss_clip": 1.04072297, + "balance_loss_mlp": 1.00153184, + "epoch": 0.24313112487224192, + "flos": 69900948673920.0, + "grad_norm": 0.7648371442507012, + "language_loss": 0.5787797, + "learning_rate": 3.542277011818934e-06, + "loss": 0.60091776, + "num_input_tokens_seen": 43145430, + "step": 2022, + "time_per_iteration": 3.4417006969451904 + }, + { + "auxiliary_loss_clip": 0.01131503, + "auxiliary_loss_mlp": 0.01089933, + "balance_loss_clip": 1.03909326, + "balance_loss_mlp": 1.00844169, + "epoch": 0.24325136776288103, + "flos": 40662334235520.0, + "grad_norm": 1.9626869885822729, + "language_loss": 0.74126595, + "learning_rate": 3.5417809489808104e-06, + "loss": 0.76348031, + "num_input_tokens_seen": 43167040, + "step": 2023, + "time_per_iteration": 2.9357974529266357 + }, + { + "auxiliary_loss_clip": 0.01149772, + "auxiliary_loss_mlp": 0.01089455, + "balance_loss_clip": 1.03750801, + "balance_loss_mlp": 1.00772476, + "epoch": 0.24337161065352012, + "flos": 25046400257280.0, + "grad_norm": 1.7181118202060155, + "language_loss": 0.72464573, + "learning_rate": 3.5412846522557422e-06, + "loss": 0.74703801, + "num_input_tokens_seen": 43187930, + "step": 2024, + "time_per_iteration": 2.7732932567596436 + }, + { + "auxiliary_loss_clip": 0.01160026, + "auxiliary_loss_mlp": 0.0108999, + "balance_loss_clip": 1.03968954, + "balance_loss_mlp": 1.00821245, + "epoch": 0.2434918535441592, + "flos": 18661160090880.0, + "grad_norm": 2.180039315296447, + "language_loss": 0.74136889, + "learning_rate": 3.540788121719018e-06, + "loss": 0.76386905, + "num_input_tokens_seen": 43206350, + "step": 2025, + "time_per_iteration": 3.649993419647217 + }, + { + "auxiliary_loss_clip": 0.0111961, + "auxiliary_loss_mlp": 0.01088943, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 1.00716567, + "epoch": 0.24361209643479828, + "flos": 23915142345600.0, + "grad_norm": 1.7725387192942987, + "language_loss": 0.82118559, + "learning_rate": 3.5402913574459604e-06, + "loss": 0.84327114, + "num_input_tokens_seen": 43226255, + "step": 2026, + "time_per_iteration": 2.8329687118530273 + }, + { + "auxiliary_loss_clip": 0.011038, + "auxiliary_loss_mlp": 0.01088732, + "balance_loss_clip": 1.03434086, + "balance_loss_mlp": 1.0071454, + "epoch": 0.2437323393254374, + "flos": 28657505232000.0, + "grad_norm": 2.078502178238336, + "language_loss": 0.86065805, + "learning_rate": 3.5397943595119297e-06, + "loss": 0.88258338, + "num_input_tokens_seen": 43247675, + "step": 2027, + "time_per_iteration": 2.9270410537719727 + }, + { + "auxiliary_loss_clip": 0.01134206, + "auxiliary_loss_mlp": 0.01087724, + "balance_loss_clip": 1.03555083, + "balance_loss_mlp": 1.00594616, + "epoch": 0.24385258221607647, + "flos": 23550325862400.0, + "grad_norm": 2.733435262599775, + "language_loss": 0.78100693, + "learning_rate": 3.5392971279923177e-06, + "loss": 0.80322623, + "num_input_tokens_seen": 43265895, + "step": 2028, + "time_per_iteration": 3.75882625579834 + }, + { + "auxiliary_loss_clip": 0.01128973, + "auxiliary_loss_mlp": 0.01090154, + "balance_loss_clip": 1.03554368, + "balance_loss_mlp": 1.00828135, + "epoch": 0.24397282510671556, + "flos": 25336091445120.0, + "grad_norm": 2.097614037148584, + "language_loss": 0.83143139, + "learning_rate": 3.5387996629625557e-06, + "loss": 0.85362267, + "num_input_tokens_seen": 43283485, + "step": 2029, + "time_per_iteration": 2.867603302001953 + }, + { + "auxiliary_loss_clip": 0.01151851, + "auxiliary_loss_mlp": 0.01080356, + "balance_loss_clip": 1.04376125, + "balance_loss_mlp": 1.00062847, + "epoch": 0.24409306799735467, + "flos": 65187421430400.0, + "grad_norm": 0.8039358387078157, + "language_loss": 0.54999697, + "learning_rate": 3.5383019644981083e-06, + "loss": 0.57231903, + "num_input_tokens_seen": 43347180, + "step": 2030, + "time_per_iteration": 4.167879104614258 + }, + { + "auxiliary_loss_clip": 0.01132424, + "auxiliary_loss_mlp": 0.01088906, + "balance_loss_clip": 1.03370106, + "balance_loss_mlp": 1.00703263, + "epoch": 0.24421331088799375, + "flos": 19537093152000.0, + "grad_norm": 2.063835745746, + "language_loss": 0.72697628, + "learning_rate": 3.5378040326744763e-06, + "loss": 0.74918956, + "num_input_tokens_seen": 43366665, + "step": 2031, + "time_per_iteration": 2.6997737884521484 + }, + { + "auxiliary_loss_clip": 0.01129257, + "auxiliary_loss_mlp": 0.01088458, + "balance_loss_clip": 1.03649426, + "balance_loss_mlp": 1.00677538, + "epoch": 0.24433355377863283, + "flos": 21068575378560.0, + "grad_norm": 5.416167107398701, + "language_loss": 0.85578901, + "learning_rate": 3.5373058675671946e-06, + "loss": 0.87796617, + "num_input_tokens_seen": 43384670, + "step": 2032, + "time_per_iteration": 2.810469627380371 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01089022, + "balance_loss_clip": 1.03171813, + "balance_loss_mlp": 1.00714898, + "epoch": 0.24445379666927192, + "flos": 22637189289600.0, + "grad_norm": 1.828290857653142, + "language_loss": 0.72082925, + "learning_rate": 3.536807469251836e-06, + "loss": 0.74288249, + "num_input_tokens_seen": 43403825, + "step": 2033, + "time_per_iteration": 2.766618013381958 + }, + { + "auxiliary_loss_clip": 0.01123156, + "auxiliary_loss_mlp": 0.01089872, + "balance_loss_clip": 1.03582847, + "balance_loss_mlp": 1.00799942, + "epoch": 0.24457403955991103, + "flos": 21251612108160.0, + "grad_norm": 2.23639961015501, + "language_loss": 0.82932991, + "learning_rate": 3.5363088378040055e-06, + "loss": 0.85146022, + "num_input_tokens_seen": 43422715, + "step": 2034, + "time_per_iteration": 2.78422474861145 + }, + { + "auxiliary_loss_clip": 0.01151891, + "auxiliary_loss_mlp": 0.00873131, + "balance_loss_clip": 1.04385006, + "balance_loss_mlp": 0.99967062, + "epoch": 0.2446942824505501, + "flos": 66997820764800.0, + "grad_norm": 0.7578507681749371, + "language_loss": 0.64374745, + "learning_rate": 3.5358099732993463e-06, + "loss": 0.66399765, + "num_input_tokens_seen": 43481825, + "step": 2035, + "time_per_iteration": 3.1589620113372803 + }, + { + "auxiliary_loss_clip": 0.01140525, + "auxiliary_loss_mlp": 0.01089087, + "balance_loss_clip": 1.03684449, + "balance_loss_mlp": 1.00716686, + "epoch": 0.2448145253411892, + "flos": 20411122792320.0, + "grad_norm": 1.9540009075985119, + "language_loss": 0.89806581, + "learning_rate": 3.535310875813535e-06, + "loss": 0.920362, + "num_input_tokens_seen": 43500220, + "step": 2036, + "time_per_iteration": 2.7420146465301514 + }, + { + "auxiliary_loss_clip": 0.0114674, + "auxiliary_loss_mlp": 0.01088199, + "balance_loss_clip": 1.03601265, + "balance_loss_mlp": 1.00642157, + "epoch": 0.2449347682318283, + "flos": 28804739080320.0, + "grad_norm": 1.706456840572944, + "language_loss": 0.81432438, + "learning_rate": 3.5348115454222843e-06, + "loss": 0.8366738, + "num_input_tokens_seen": 43522805, + "step": 2037, + "time_per_iteration": 2.7523326873779297 + }, + { + "auxiliary_loss_clip": 0.01141267, + "auxiliary_loss_mlp": 0.01088762, + "balance_loss_clip": 1.03763914, + "balance_loss_mlp": 1.00679386, + "epoch": 0.2450550111224674, + "flos": 22528990546560.0, + "grad_norm": 1.718655959911645, + "language_loss": 0.86391509, + "learning_rate": 3.5343119822013425e-06, + "loss": 0.88621545, + "num_input_tokens_seen": 43541915, + "step": 2038, + "time_per_iteration": 2.7538115978240967 + }, + { + "auxiliary_loss_clip": 0.01150286, + "auxiliary_loss_mlp": 0.01090105, + "balance_loss_clip": 1.03793025, + "balance_loss_mlp": 1.00813663, + "epoch": 0.24517525401310647, + "flos": 21759137326080.0, + "grad_norm": 1.9839996246872351, + "language_loss": 0.77468938, + "learning_rate": 3.533812186226493e-06, + "loss": 0.79709327, + "num_input_tokens_seen": 43562625, + "step": 2039, + "time_per_iteration": 2.8000504970550537 + }, + { + "auxiliary_loss_clip": 0.01156918, + "auxiliary_loss_mlp": 0.01088171, + "balance_loss_clip": 1.0365504, + "balance_loss_mlp": 1.00658441, + "epoch": 0.24529549690374555, + "flos": 25043311687680.0, + "grad_norm": 1.7303847131063979, + "language_loss": 0.75978923, + "learning_rate": 3.5333121575735545e-06, + "loss": 0.78224015, + "num_input_tokens_seen": 43582265, + "step": 2040, + "time_per_iteration": 2.8267135620117188 + }, + { + "auxiliary_loss_clip": 0.0113735, + "auxiliary_loss_mlp": 0.01088193, + "balance_loss_clip": 1.03499293, + "balance_loss_mlp": 1.00655806, + "epoch": 0.24541573979438466, + "flos": 32123638915200.0, + "grad_norm": 1.8094883582032666, + "language_loss": 0.755642, + "learning_rate": 3.532811896318381e-06, + "loss": 0.77789736, + "num_input_tokens_seen": 43604335, + "step": 2041, + "time_per_iteration": 2.8273024559020996 + }, + { + "auxiliary_loss_clip": 0.01125937, + "auxiliary_loss_mlp": 0.01089963, + "balance_loss_clip": 1.03312826, + "balance_loss_mlp": 1.00789928, + "epoch": 0.24553598268502375, + "flos": 31357556622720.0, + "grad_norm": 2.5997246696485585, + "language_loss": 0.81598496, + "learning_rate": 3.5323114025368615e-06, + "loss": 0.83814394, + "num_input_tokens_seen": 43619400, + "step": 2042, + "time_per_iteration": 2.8920671939849854 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01089175, + "balance_loss_clip": 1.03518498, + "balance_loss_mlp": 1.00739765, + "epoch": 0.24565622557566283, + "flos": 14027462824320.0, + "grad_norm": 2.599247496622039, + "language_loss": 0.8181361, + "learning_rate": 3.53181067630492e-06, + "loss": 0.84050941, + "num_input_tokens_seen": 43636870, + "step": 2043, + "time_per_iteration": 2.687145471572876 + }, + { + "auxiliary_loss_clip": 0.01136779, + "auxiliary_loss_mlp": 0.01089651, + "balance_loss_clip": 1.03567839, + "balance_loss_mlp": 1.00777841, + "epoch": 0.24577646846630194, + "flos": 16581465515520.0, + "grad_norm": 2.0612316195288973, + "language_loss": 0.76036036, + "learning_rate": 3.5313097176985175e-06, + "loss": 0.78262472, + "num_input_tokens_seen": 43655180, + "step": 2044, + "time_per_iteration": 2.7520813941955566 + }, + { + "auxiliary_loss_clip": 0.01148149, + "auxiliary_loss_mlp": 0.01086984, + "balance_loss_clip": 1.0367285, + "balance_loss_mlp": 1.00534987, + "epoch": 0.24589671135694102, + "flos": 18807424272000.0, + "grad_norm": 1.780859806006879, + "language_loss": 0.80949557, + "learning_rate": 3.5308085267936482e-06, + "loss": 0.83184689, + "num_input_tokens_seen": 43672895, + "step": 2045, + "time_per_iteration": 2.712055206298828 + }, + { + "auxiliary_loss_clip": 0.01100203, + "auxiliary_loss_mlp": 0.0087392, + "balance_loss_clip": 1.03213465, + "balance_loss_mlp": 0.99994016, + "epoch": 0.2460169542475801, + "flos": 19938538529280.0, + "grad_norm": 1.7331915272315292, + "language_loss": 0.89772671, + "learning_rate": 3.530307103666342e-06, + "loss": 0.91746795, + "num_input_tokens_seen": 43691975, + "step": 2046, + "time_per_iteration": 2.928867816925049 + }, + { + "auxiliary_loss_clip": 0.01120737, + "auxiliary_loss_mlp": 0.01089234, + "balance_loss_clip": 1.03466272, + "balance_loss_mlp": 1.00726604, + "epoch": 0.24613719713821922, + "flos": 24171221381760.0, + "grad_norm": 1.6135227122880405, + "language_loss": 0.79950595, + "learning_rate": 3.5298054483926658e-06, + "loss": 0.82160568, + "num_input_tokens_seen": 43712670, + "step": 2047, + "time_per_iteration": 3.6986515522003174 + }, + { + "auxiliary_loss_clip": 0.01150524, + "auxiliary_loss_mlp": 0.01090052, + "balance_loss_clip": 1.03798127, + "balance_loss_mlp": 1.00798821, + "epoch": 0.2462574400288583, + "flos": 30221055325440.0, + "grad_norm": 1.8997237408272738, + "language_loss": 0.82681835, + "learning_rate": 3.5293035610487187e-06, + "loss": 0.84922409, + "num_input_tokens_seen": 43732035, + "step": 2048, + "time_per_iteration": 2.7914505004882812 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01082331, + "balance_loss_clip": 1.04111338, + "balance_loss_mlp": 1.00260341, + "epoch": 0.24637768291949738, + "flos": 68943030819840.0, + "grad_norm": 0.7406185265616213, + "language_loss": 0.61999637, + "learning_rate": 3.5288014417106374e-06, + "loss": 0.64203632, + "num_input_tokens_seen": 43798055, + "step": 2049, + "time_per_iteration": 3.3220174312591553 + }, + { + "auxiliary_loss_clip": 0.01124547, + "auxiliary_loss_mlp": 0.01089913, + "balance_loss_clip": 1.03192997, + "balance_loss_mlp": 1.00789738, + "epoch": 0.24649792581013646, + "flos": 34383999922560.0, + "grad_norm": 1.8383294226899372, + "language_loss": 0.75439256, + "learning_rate": 3.528299090454593e-06, + "loss": 0.77653712, + "num_input_tokens_seen": 43818590, + "step": 2050, + "time_per_iteration": 2.915371894836426 + }, + { + "auxiliary_loss_clip": 0.01149439, + "auxiliary_loss_mlp": 0.01088612, + "balance_loss_clip": 1.0371685, + "balance_loss_mlp": 1.00688171, + "epoch": 0.24661816870077558, + "flos": 19680448331520.0, + "grad_norm": 3.9056233637821736, + "language_loss": 0.82756072, + "learning_rate": 3.527796507356792e-06, + "loss": 0.84994119, + "num_input_tokens_seen": 43832480, + "step": 2051, + "time_per_iteration": 3.69399356842041 + }, + { + "auxiliary_loss_clip": 0.01141096, + "auxiliary_loss_mlp": 0.01088439, + "balance_loss_clip": 1.03747916, + "balance_loss_mlp": 1.00661361, + "epoch": 0.24673841159141466, + "flos": 20002279213440.0, + "grad_norm": 2.3890361591652676, + "language_loss": 0.89991188, + "learning_rate": 3.527293692493475e-06, + "loss": 0.92220724, + "num_input_tokens_seen": 43848345, + "step": 2052, + "time_per_iteration": 3.6081204414367676 + }, + { + "auxiliary_loss_clip": 0.01149077, + "auxiliary_loss_mlp": 0.01087496, + "balance_loss_clip": 1.03661716, + "balance_loss_mlp": 1.00562334, + "epoch": 0.24685865448205374, + "flos": 21646593037440.0, + "grad_norm": 2.589581124034318, + "language_loss": 0.73423463, + "learning_rate": 3.52679064594092e-06, + "loss": 0.75660032, + "num_input_tokens_seen": 43865685, + "step": 2053, + "time_per_iteration": 2.684661388397217 + }, + { + "auxiliary_loss_clip": 0.01121077, + "auxiliary_loss_mlp": 0.01088369, + "balance_loss_clip": 1.03268576, + "balance_loss_mlp": 1.00673485, + "epoch": 0.24697889737269285, + "flos": 17960470508160.0, + "grad_norm": 1.8614020345250697, + "language_loss": 0.75044572, + "learning_rate": 3.5262873677754375e-06, + "loss": 0.77254015, + "num_input_tokens_seen": 43883690, + "step": 2054, + "time_per_iteration": 2.832274913787842 + }, + { + "auxiliary_loss_clip": 0.01157969, + "auxiliary_loss_mlp": 0.01089912, + "balance_loss_clip": 1.03761673, + "balance_loss_mlp": 1.00813484, + "epoch": 0.24709914026333193, + "flos": 27344611221120.0, + "grad_norm": 1.8041779920198224, + "language_loss": 0.8039211, + "learning_rate": 3.5257838580733745e-06, + "loss": 0.82639992, + "num_input_tokens_seen": 43903295, + "step": 2055, + "time_per_iteration": 3.72107195854187 + }, + { + "auxiliary_loss_clip": 0.01149437, + "auxiliary_loss_mlp": 0.01087619, + "balance_loss_clip": 1.03716779, + "balance_loss_mlp": 1.00584126, + "epoch": 0.24721938315397102, + "flos": 19275519335040.0, + "grad_norm": 1.8522572771932055, + "language_loss": 0.87369221, + "learning_rate": 3.5252801169111138e-06, + "loss": 0.89606279, + "num_input_tokens_seen": 43920960, + "step": 2056, + "time_per_iteration": 2.6943023204803467 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01090055, + "balance_loss_clip": 1.03494537, + "balance_loss_mlp": 1.0082773, + "epoch": 0.2473396260446101, + "flos": 23185796688000.0, + "grad_norm": 2.5565585047392285, + "language_loss": 0.79594243, + "learning_rate": 3.524776144365072e-06, + "loss": 0.81819379, + "num_input_tokens_seen": 43939415, + "step": 2057, + "time_per_iteration": 2.783850908279419 + }, + { + "auxiliary_loss_clip": 0.01134431, + "auxiliary_loss_mlp": 0.01089027, + "balance_loss_clip": 1.03628218, + "balance_loss_mlp": 1.00734472, + "epoch": 0.2474598689352492, + "flos": 21142443697920.0, + "grad_norm": 1.769821200793656, + "language_loss": 0.79290622, + "learning_rate": 3.5242719405117016e-06, + "loss": 0.81514078, + "num_input_tokens_seen": 43959220, + "step": 2058, + "time_per_iteration": 2.7396934032440186 + }, + { + "auxiliary_loss_clip": 0.01131151, + "auxiliary_loss_mlp": 0.00873974, + "balance_loss_clip": 1.03017831, + "balance_loss_mlp": 0.99995482, + "epoch": 0.2475801118258883, + "flos": 21648352803840.0, + "grad_norm": 4.959030179911586, + "language_loss": 0.74682581, + "learning_rate": 3.5237675054274893e-06, + "loss": 0.766877, + "num_input_tokens_seen": 43978420, + "step": 2059, + "time_per_iteration": 2.776644468307495 + }, + { + "auxiliary_loss_clip": 0.01149685, + "auxiliary_loss_mlp": 0.01089699, + "balance_loss_clip": 1.03773868, + "balance_loss_mlp": 1.00787413, + "epoch": 0.24770035471652738, + "flos": 22674500542080.0, + "grad_norm": 1.8154157132654787, + "language_loss": 0.80283731, + "learning_rate": 3.5232628391889584e-06, + "loss": 0.82523113, + "num_input_tokens_seen": 43996710, + "step": 2060, + "time_per_iteration": 2.7483465671539307 + }, + { + "auxiliary_loss_clip": 0.01113228, + "auxiliary_loss_mlp": 0.01088467, + "balance_loss_clip": 1.0318706, + "balance_loss_mlp": 1.00692785, + "epoch": 0.2478205976071665, + "flos": 22163814927360.0, + "grad_norm": 3.973978614144453, + "language_loss": 0.64385736, + "learning_rate": 3.522757941872666e-06, + "loss": 0.6658743, + "num_input_tokens_seen": 44014865, + "step": 2061, + "time_per_iteration": 2.7819645404815674 + }, + { + "auxiliary_loss_clip": 0.01159838, + "auxiliary_loss_mlp": 0.00873989, + "balance_loss_clip": 1.04008365, + "balance_loss_mlp": 0.99994874, + "epoch": 0.24794084049780557, + "flos": 24973106555520.0, + "grad_norm": 1.5515764258852505, + "language_loss": 0.82528061, + "learning_rate": 3.5222528135552042e-06, + "loss": 0.8456189, + "num_input_tokens_seen": 44036325, + "step": 2062, + "time_per_iteration": 2.7001168727874756 + }, + { + "auxiliary_loss_clip": 0.01145377, + "auxiliary_loss_mlp": 0.01089019, + "balance_loss_clip": 1.03732753, + "balance_loss_mlp": 1.00719368, + "epoch": 0.24806108338844465, + "flos": 18296379521280.0, + "grad_norm": 2.069566481573434, + "language_loss": 0.80547738, + "learning_rate": 3.521747454313201e-06, + "loss": 0.82782137, + "num_input_tokens_seen": 44055005, + "step": 2063, + "time_per_iteration": 2.6693034172058105 + }, + { + "auxiliary_loss_clip": 0.01127547, + "auxiliary_loss_mlp": 0.01088869, + "balance_loss_clip": 1.03335667, + "balance_loss_mlp": 1.00709176, + "epoch": 0.24818132627908374, + "flos": 19282163351040.0, + "grad_norm": 1.9862500643517906, + "language_loss": 0.66652369, + "learning_rate": 3.521241864223319e-06, + "loss": 0.6886878, + "num_input_tokens_seen": 44073965, + "step": 2064, + "time_per_iteration": 2.734980344772339 + }, + { + "auxiliary_loss_clip": 0.01131177, + "auxiliary_loss_mlp": 0.01080779, + "balance_loss_clip": 1.03999615, + "balance_loss_mlp": 1.00105143, + "epoch": 0.24830156916972285, + "flos": 70285837881600.0, + "grad_norm": 0.7974977920914165, + "language_loss": 0.62080377, + "learning_rate": 3.5207360433622552e-06, + "loss": 0.64292336, + "num_input_tokens_seen": 44135965, + "step": 2065, + "time_per_iteration": 3.308382987976074 + }, + { + "auxiliary_loss_clip": 0.01138256, + "auxiliary_loss_mlp": 0.01090632, + "balance_loss_clip": 1.03657269, + "balance_loss_mlp": 1.00904536, + "epoch": 0.24842181206036193, + "flos": 40409128287360.0, + "grad_norm": 1.6893368549010386, + "language_loss": 0.745713, + "learning_rate": 3.5202299918067437e-06, + "loss": 0.76800185, + "num_input_tokens_seen": 44159560, + "step": 2066, + "time_per_iteration": 2.909635305404663 + }, + { + "auxiliary_loss_clip": 0.01145872, + "auxiliary_loss_mlp": 0.01089583, + "balance_loss_clip": 1.03500068, + "balance_loss_mlp": 1.00794828, + "epoch": 0.248542054951001, + "flos": 20082432412800.0, + "grad_norm": 2.567346488511928, + "language_loss": 0.69249451, + "learning_rate": 3.519723709633551e-06, + "loss": 0.714849, + "num_input_tokens_seen": 44178320, + "step": 2067, + "time_per_iteration": 2.7313029766082764 + }, + { + "auxiliary_loss_clip": 0.01138059, + "auxiliary_loss_mlp": 0.010912, + "balance_loss_clip": 1.03531623, + "balance_loss_mlp": 1.00951719, + "epoch": 0.24866229784164012, + "flos": 23513948363520.0, + "grad_norm": 1.8203902882622398, + "language_loss": 0.83642602, + "learning_rate": 3.519217196919479e-06, + "loss": 0.85871857, + "num_input_tokens_seen": 44197305, + "step": 2068, + "time_per_iteration": 2.7588398456573486 + }, + { + "auxiliary_loss_clip": 0.01136789, + "auxiliary_loss_mlp": 0.01089878, + "balance_loss_clip": 1.03507459, + "balance_loss_mlp": 1.00805235, + "epoch": 0.2487825407322792, + "flos": 19865101173120.0, + "grad_norm": 1.69497891780627, + "language_loss": 0.73119438, + "learning_rate": 3.518710453741367e-06, + "loss": 0.75346106, + "num_input_tokens_seen": 44216505, + "step": 2069, + "time_per_iteration": 2.71626877784729 + }, + { + "auxiliary_loss_clip": 0.01136607, + "auxiliary_loss_mlp": 0.00873983, + "balance_loss_clip": 1.03407753, + "balance_loss_mlp": 0.99986446, + "epoch": 0.2489027836229183, + "flos": 22017622573440.0, + "grad_norm": 5.2553077392530945, + "language_loss": 0.67423391, + "learning_rate": 3.518203480176086e-06, + "loss": 0.69433987, + "num_input_tokens_seen": 44235435, + "step": 2070, + "time_per_iteration": 2.784153938293457 + }, + { + "auxiliary_loss_clip": 0.01098763, + "auxiliary_loss_mlp": 0.01088577, + "balance_loss_clip": 1.029634, + "balance_loss_mlp": 1.00694203, + "epoch": 0.2490230265135574, + "flos": 23294354567040.0, + "grad_norm": 1.7554455459480194, + "language_loss": 0.80497527, + "learning_rate": 3.517696276300545e-06, + "loss": 0.82684863, + "num_input_tokens_seen": 44256975, + "step": 2071, + "time_per_iteration": 2.953479290008545 + }, + { + "auxiliary_loss_clip": 0.01140979, + "auxiliary_loss_mlp": 0.01090888, + "balance_loss_clip": 1.0364908, + "balance_loss_mlp": 1.00906253, + "epoch": 0.24914326940419648, + "flos": 19826784339840.0, + "grad_norm": 2.4979021172102267, + "language_loss": 0.69209731, + "learning_rate": 3.517188842191685e-06, + "loss": 0.71441591, + "num_input_tokens_seen": 44275125, + "step": 2072, + "time_per_iteration": 3.7130630016326904 + }, + { + "auxiliary_loss_clip": 0.01147354, + "auxiliary_loss_mlp": 0.01089698, + "balance_loss_clip": 1.0356245, + "balance_loss_mlp": 1.00782514, + "epoch": 0.24926351229483557, + "flos": 20229271211520.0, + "grad_norm": 1.6308313264879608, + "language_loss": 0.74082088, + "learning_rate": 3.5166811779264837e-06, + "loss": 0.76319134, + "num_input_tokens_seen": 44295445, + "step": 2073, + "time_per_iteration": 2.745896339416504 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.01088004, + "balance_loss_clip": 1.03490996, + "balance_loss_mlp": 1.00632226, + "epoch": 0.24938375518547465, + "flos": 23294570048640.0, + "grad_norm": 1.7475464362761401, + "language_loss": 0.77732158, + "learning_rate": 3.5161732835819545e-06, + "loss": 0.79974455, + "num_input_tokens_seen": 44314755, + "step": 2074, + "time_per_iteration": 2.7414255142211914 + }, + { + "auxiliary_loss_clip": 0.01156987, + "auxiliary_loss_mlp": 0.01088707, + "balance_loss_clip": 1.03747749, + "balance_loss_mlp": 1.00697756, + "epoch": 0.24950399807611376, + "flos": 17311673099520.0, + "grad_norm": 1.7382620912662825, + "language_loss": 0.8312906, + "learning_rate": 3.515665159235143e-06, + "loss": 0.85374761, + "num_input_tokens_seen": 44333640, + "step": 2075, + "time_per_iteration": 2.7417819499969482 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01087678, + "balance_loss_clip": 1.03416622, + "balance_loss_mlp": 1.00618625, + "epoch": 0.24962424096675284, + "flos": 19024863252480.0, + "grad_norm": 1.5514337425142903, + "language_loss": 0.74808574, + "learning_rate": 3.5151568049631318e-06, + "loss": 0.77033496, + "num_input_tokens_seen": 44352355, + "step": 2076, + "time_per_iteration": 3.7527997493743896 + }, + { + "auxiliary_loss_clip": 0.01155526, + "auxiliary_loss_mlp": 0.01089931, + "balance_loss_clip": 1.03579211, + "balance_loss_mlp": 1.00820124, + "epoch": 0.24974448385739192, + "flos": 33398790710400.0, + "grad_norm": 1.785402535347892, + "language_loss": 0.80245745, + "learning_rate": 3.5146482208430385e-06, + "loss": 0.82491195, + "num_input_tokens_seen": 44374185, + "step": 2077, + "time_per_iteration": 2.7223823070526123 + }, + { + "auxiliary_loss_clip": 0.01106672, + "auxiliary_loss_mlp": 0.01088563, + "balance_loss_clip": 1.02989888, + "balance_loss_mlp": 1.00659466, + "epoch": 0.24986472674803104, + "flos": 30007279532160.0, + "grad_norm": 1.9148278707089934, + "language_loss": 0.67834985, + "learning_rate": 3.514139406952014e-06, + "loss": 0.70030224, + "num_input_tokens_seen": 44396210, + "step": 2078, + "time_per_iteration": 3.9252169132232666 + }, + { + "auxiliary_loss_clip": 0.01145729, + "auxiliary_loss_mlp": 0.01087648, + "balance_loss_clip": 1.03514206, + "balance_loss_mlp": 1.00606132, + "epoch": 0.24998496963867012, + "flos": 26613074833920.0, + "grad_norm": 1.859193175609056, + "language_loss": 0.83714771, + "learning_rate": 3.5136303633672454e-06, + "loss": 0.85948151, + "num_input_tokens_seen": 44416340, + "step": 2079, + "time_per_iteration": 2.7503936290740967 + }, + { + "auxiliary_loss_clip": 0.01128274, + "auxiliary_loss_mlp": 0.00873913, + "balance_loss_clip": 1.03394318, + "balance_loss_mlp": 0.9999029, + "epoch": 0.25010521252930923, + "flos": 23553989049600.0, + "grad_norm": 1.5463137247989263, + "language_loss": 0.74686658, + "learning_rate": 3.5131210901659544e-06, + "loss": 0.7668885, + "num_input_tokens_seen": 44438095, + "step": 2080, + "time_per_iteration": 3.797950029373169 + }, + { + "auxiliary_loss_clip": 0.01125462, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_clip": 1.03236604, + "balance_loss_mlp": 1.00642145, + "epoch": 0.2502254554199483, + "flos": 23441193365760.0, + "grad_norm": 2.385500243652611, + "language_loss": 0.82036763, + "learning_rate": 3.5126115874253967e-06, + "loss": 0.84250283, + "num_input_tokens_seen": 44457650, + "step": 2081, + "time_per_iteration": 2.7830734252929688 + }, + { + "auxiliary_loss_clip": 0.01122319, + "auxiliary_loss_mlp": 0.01088328, + "balance_loss_clip": 1.0327363, + "balance_loss_mlp": 1.00664556, + "epoch": 0.2503456983105874, + "flos": 28761681651840.0, + "grad_norm": 2.037486557019455, + "language_loss": 0.81004483, + "learning_rate": 3.5121018552228644e-06, + "loss": 0.83215129, + "num_input_tokens_seen": 44476155, + "step": 2082, + "time_per_iteration": 2.8953845500946045 + }, + { + "auxiliary_loss_clip": 0.01124721, + "auxiliary_loss_mlp": 0.01086754, + "balance_loss_clip": 1.0329473, + "balance_loss_mlp": 1.00516701, + "epoch": 0.2504659412012265, + "flos": 18770256673920.0, + "grad_norm": 6.6284053804766705, + "language_loss": 0.75991642, + "learning_rate": 3.5115918936356827e-06, + "loss": 0.78203118, + "num_input_tokens_seen": 44492910, + "step": 2083, + "time_per_iteration": 2.848707675933838 + }, + { + "auxiliary_loss_clip": 0.0112722, + "auxiliary_loss_mlp": 0.01089573, + "balance_loss_clip": 1.03406441, + "balance_loss_mlp": 1.00793803, + "epoch": 0.25058618409186556, + "flos": 16873383346560.0, + "grad_norm": 1.8013431785048395, + "language_loss": 0.78969455, + "learning_rate": 3.5110817027412123e-06, + "loss": 0.81186247, + "num_input_tokens_seen": 44512000, + "step": 2084, + "time_per_iteration": 2.794809341430664 + }, + { + "auxiliary_loss_clip": 0.01127612, + "auxiliary_loss_mlp": 0.01088637, + "balance_loss_clip": 1.03407574, + "balance_loss_mlp": 1.00709796, + "epoch": 0.25070642698250467, + "flos": 24425540651520.0, + "grad_norm": 4.146880442914958, + "language_loss": 0.68825591, + "learning_rate": 3.5105712826168493e-06, + "loss": 0.71041846, + "num_input_tokens_seen": 44531650, + "step": 2085, + "time_per_iteration": 2.832155704498291 + }, + { + "auxiliary_loss_clip": 0.01145438, + "auxiliary_loss_mlp": 0.00873774, + "balance_loss_clip": 1.03450394, + "balance_loss_mlp": 0.99986166, + "epoch": 0.2508266698731437, + "flos": 20260944028800.0, + "grad_norm": 1.739161666592525, + "language_loss": 0.70740056, + "learning_rate": 3.5100606333400235e-06, + "loss": 0.72759265, + "num_input_tokens_seen": 44548785, + "step": 2086, + "time_per_iteration": 2.725252151489258 + }, + { + "auxiliary_loss_clip": 0.01139361, + "auxiliary_loss_mlp": 0.01088316, + "balance_loss_clip": 1.03558135, + "balance_loss_mlp": 1.00653887, + "epoch": 0.25094691276378284, + "flos": 19245318975360.0, + "grad_norm": 2.0278255989341627, + "language_loss": 0.77177435, + "learning_rate": 3.5095497549882006e-06, + "loss": 0.79405111, + "num_input_tokens_seen": 44567230, + "step": 2087, + "time_per_iteration": 2.772305727005005 + }, + { + "auxiliary_loss_clip": 0.01146202, + "auxiliary_loss_mlp": 0.01090033, + "balance_loss_clip": 1.03610349, + "balance_loss_mlp": 1.008255, + "epoch": 0.25106715565442195, + "flos": 26943237671040.0, + "grad_norm": 2.007052681101528, + "language_loss": 0.72642243, + "learning_rate": 3.50903864763888e-06, + "loss": 0.74878472, + "num_input_tokens_seen": 44588020, + "step": 2088, + "time_per_iteration": 2.7235910892486572 + }, + { + "auxiliary_loss_clip": 0.01147093, + "auxiliary_loss_mlp": 0.01087291, + "balance_loss_clip": 1.03585052, + "balance_loss_mlp": 1.00556111, + "epoch": 0.251187398545061, + "flos": 48359570572800.0, + "grad_norm": 2.1683085185539883, + "language_loss": 0.76335764, + "learning_rate": 3.5085273113695965e-06, + "loss": 0.78570145, + "num_input_tokens_seen": 44612590, + "step": 2089, + "time_per_iteration": 2.881084680557251 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01089713, + "balance_loss_clip": 1.03631413, + "balance_loss_mlp": 1.00784016, + "epoch": 0.2513076414357001, + "flos": 27016100409600.0, + "grad_norm": 2.3119223092526666, + "language_loss": 0.78339881, + "learning_rate": 3.508015746257919e-06, + "loss": 0.80585217, + "num_input_tokens_seen": 44631630, + "step": 2090, + "time_per_iteration": 2.7247540950775146 + }, + { + "auxiliary_loss_clip": 0.01124967, + "auxiliary_loss_mlp": 0.01089353, + "balance_loss_clip": 1.03229165, + "balance_loss_mlp": 1.00767088, + "epoch": 0.2514278843263392, + "flos": 19463619882240.0, + "grad_norm": 1.8401787740732505, + "language_loss": 0.83385652, + "learning_rate": 3.5075039523814518e-06, + "loss": 0.85599971, + "num_input_tokens_seen": 44650820, + "step": 2091, + "time_per_iteration": 2.770663022994995 + }, + { + "auxiliary_loss_clip": 0.01145547, + "auxiliary_loss_mlp": 0.01088626, + "balance_loss_clip": 1.03381968, + "balance_loss_mlp": 1.0067054, + "epoch": 0.2515481272169783, + "flos": 16866092885760.0, + "grad_norm": 2.0291216643665453, + "language_loss": 0.81765431, + "learning_rate": 3.506991929817834e-06, + "loss": 0.83999598, + "num_input_tokens_seen": 44667540, + "step": 2092, + "time_per_iteration": 2.755021095275879 + }, + { + "auxiliary_loss_clip": 0.01156826, + "auxiliary_loss_mlp": 0.01089275, + "balance_loss_clip": 1.03863478, + "balance_loss_mlp": 1.00773644, + "epoch": 0.2516683701076174, + "flos": 23732464752000.0, + "grad_norm": 1.7066513746771785, + "language_loss": 0.83077556, + "learning_rate": 3.506479678644738e-06, + "loss": 0.85323656, + "num_input_tokens_seen": 44687935, + "step": 2093, + "time_per_iteration": 2.789532423019409 + }, + { + "auxiliary_loss_clip": 0.01110734, + "auxiliary_loss_mlp": 0.01088476, + "balance_loss_clip": 1.02986169, + "balance_loss_mlp": 1.00669837, + "epoch": 0.2517886129982565, + "flos": 27635954434560.0, + "grad_norm": 2.3792453178111135, + "language_loss": 0.73767614, + "learning_rate": 3.505967198939873e-06, + "loss": 0.75966829, + "num_input_tokens_seen": 44704975, + "step": 2094, + "time_per_iteration": 2.9267332553863525 + }, + { + "auxiliary_loss_clip": 0.01131275, + "auxiliary_loss_mlp": 0.01088819, + "balance_loss_clip": 1.03183079, + "balance_loss_mlp": 1.00723195, + "epoch": 0.25190885588889556, + "flos": 38104596529920.0, + "grad_norm": 1.816995882892376, + "language_loss": 0.78023875, + "learning_rate": 3.5054544907809813e-06, + "loss": 0.80243969, + "num_input_tokens_seen": 44725475, + "step": 2095, + "time_per_iteration": 2.9828124046325684 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.00873867, + "balance_loss_clip": 1.03365374, + "balance_loss_mlp": 0.99984848, + "epoch": 0.25202909877953467, + "flos": 22269894768000.0, + "grad_norm": 2.0767460365884074, + "language_loss": 0.80291915, + "learning_rate": 3.50494155424584e-06, + "loss": 0.82298625, + "num_input_tokens_seen": 44744380, + "step": 2096, + "time_per_iteration": 2.795305013656616 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_clip": 1.03561211, + "balance_loss_mlp": 1.00603628, + "epoch": 0.2521493416701738, + "flos": 21761759018880.0, + "grad_norm": 1.6878507674854437, + "language_loss": 0.83225131, + "learning_rate": 3.504428389412262e-06, + "loss": 0.85459435, + "num_input_tokens_seen": 44765190, + "step": 2097, + "time_per_iteration": 2.832288980484009 + }, + { + "auxiliary_loss_clip": 0.01148453, + "auxiliary_loss_mlp": 0.01088663, + "balance_loss_clip": 1.03717613, + "balance_loss_mlp": 1.0070281, + "epoch": 0.25226958456081283, + "flos": 27746738956800.0, + "grad_norm": 2.049023528194493, + "language_loss": 0.73042941, + "learning_rate": 3.5039149963580927e-06, + "loss": 0.75280058, + "num_input_tokens_seen": 44785210, + "step": 2098, + "time_per_iteration": 3.798630714416504 + }, + { + "auxiliary_loss_clip": 0.01126495, + "auxiliary_loss_mlp": 0.01087309, + "balance_loss_clip": 1.03385592, + "balance_loss_mlp": 1.00572205, + "epoch": 0.25238982745145194, + "flos": 30732171903360.0, + "grad_norm": 2.3013568565991562, + "language_loss": 0.7056998, + "learning_rate": 3.503401375161215e-06, + "loss": 0.72783786, + "num_input_tokens_seen": 44804955, + "step": 2099, + "time_per_iteration": 2.848543643951416 + }, + { + "auxiliary_loss_clip": 0.01153609, + "auxiliary_loss_mlp": 0.01089383, + "balance_loss_clip": 1.03470266, + "balance_loss_mlp": 1.00774813, + "epoch": 0.252510070342091, + "flos": 20266331068800.0, + "grad_norm": 1.5633752883400118, + "language_loss": 0.83617318, + "learning_rate": 3.502887525899544e-06, + "loss": 0.85860312, + "num_input_tokens_seen": 44823935, + "step": 2100, + "time_per_iteration": 2.808417797088623 + }, + { + "auxiliary_loss_clip": 0.01136479, + "auxiliary_loss_mlp": 0.01088647, + "balance_loss_clip": 1.03443253, + "balance_loss_mlp": 1.006917, + "epoch": 0.2526303132327301, + "flos": 22747399194240.0, + "grad_norm": 4.242540434757377, + "language_loss": 0.82612777, + "learning_rate": 3.50237344865103e-06, + "loss": 0.84837902, + "num_input_tokens_seen": 44844935, + "step": 2101, + "time_per_iteration": 2.811457633972168 + }, + { + "auxiliary_loss_clip": 0.01155771, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_clip": 1.03659499, + "balance_loss_mlp": 1.00650311, + "epoch": 0.2527505561233692, + "flos": 30263466309120.0, + "grad_norm": 2.085659536513558, + "language_loss": 0.76346481, + "learning_rate": 3.501859143493658e-06, + "loss": 0.7859053, + "num_input_tokens_seen": 44865565, + "step": 2102, + "time_per_iteration": 4.14337420463562 + }, + { + "auxiliary_loss_clip": 0.01143424, + "auxiliary_loss_mlp": 0.01083633, + "balance_loss_clip": 1.03882313, + "balance_loss_mlp": 1.00390553, + "epoch": 0.2528707990140083, + "flos": 58492917164160.0, + "grad_norm": 0.9280767856225671, + "language_loss": 0.60589212, + "learning_rate": 3.5013446105054488e-06, + "loss": 0.62816268, + "num_input_tokens_seen": 44918485, + "step": 2103, + "time_per_iteration": 4.043054819107056 + }, + { + "auxiliary_loss_clip": 0.01119053, + "auxiliary_loss_mlp": 0.0108877, + "balance_loss_clip": 1.03289258, + "balance_loss_mlp": 1.00723124, + "epoch": 0.2529910419046474, + "flos": 24645134448000.0, + "grad_norm": 1.7645688374907516, + "language_loss": 0.74793315, + "learning_rate": 3.5008298497644555e-06, + "loss": 0.77001137, + "num_input_tokens_seen": 44937530, + "step": 2104, + "time_per_iteration": 2.868563175201416 + }, + { + "auxiliary_loss_clip": 0.01117133, + "auxiliary_loss_mlp": 0.01088081, + "balance_loss_clip": 1.0335685, + "balance_loss_mlp": 1.0064466, + "epoch": 0.2531112847952865, + "flos": 23842135952640.0, + "grad_norm": 1.6472120706672773, + "language_loss": 0.87769926, + "learning_rate": 3.500314861348767e-06, + "loss": 0.89975142, + "num_input_tokens_seen": 44958165, + "step": 2105, + "time_per_iteration": 3.8190784454345703 + }, + { + "auxiliary_loss_clip": 0.01124506, + "auxiliary_loss_mlp": 0.01086816, + "balance_loss_clip": 1.03249669, + "balance_loss_mlp": 1.00527728, + "epoch": 0.25323152768592555, + "flos": 16143822207360.0, + "grad_norm": 1.8393782073578455, + "language_loss": 0.7742182, + "learning_rate": 3.499799645336507e-06, + "loss": 0.79633141, + "num_input_tokens_seen": 44975060, + "step": 2106, + "time_per_iteration": 2.8754494190216064 + }, + { + "auxiliary_loss_clip": 0.01146918, + "auxiliary_loss_mlp": 0.01087403, + "balance_loss_clip": 1.0366776, + "balance_loss_mlp": 1.00595903, + "epoch": 0.25335177057656466, + "flos": 28405161210240.0, + "grad_norm": 1.36603533253585, + "language_loss": 0.86794508, + "learning_rate": 3.4992842018058336e-06, + "loss": 0.89028823, + "num_input_tokens_seen": 44997960, + "step": 2107, + "time_per_iteration": 2.8617677688598633 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.0109037, + "balance_loss_clip": 1.03334367, + "balance_loss_mlp": 1.0087359, + "epoch": 0.25347201346720377, + "flos": 18799666934400.0, + "grad_norm": 2.055245127738092, + "language_loss": 0.89089757, + "learning_rate": 3.4987685308349384e-06, + "loss": 0.91307098, + "num_input_tokens_seen": 45015690, + "step": 2108, + "time_per_iteration": 2.9133362770080566 + }, + { + "auxiliary_loss_clip": 0.01128998, + "auxiliary_loss_mlp": 0.01089512, + "balance_loss_clip": 1.03357124, + "balance_loss_mlp": 1.0078299, + "epoch": 0.2535922563578428, + "flos": 15815490963840.0, + "grad_norm": 2.012746059301957, + "language_loss": 0.60972291, + "learning_rate": 3.4982526325020497e-06, + "loss": 0.63190806, + "num_input_tokens_seen": 45032660, + "step": 2109, + "time_per_iteration": 2.9335060119628906 + }, + { + "auxiliary_loss_clip": 0.01127214, + "auxiliary_loss_mlp": 0.01089421, + "balance_loss_clip": 1.035254, + "balance_loss_mlp": 1.00769126, + "epoch": 0.25371249924848194, + "flos": 16318922031360.0, + "grad_norm": 3.0828241034136394, + "language_loss": 0.81705534, + "learning_rate": 3.4977365068854273e-06, + "loss": 0.83922172, + "num_input_tokens_seen": 45048280, + "step": 2110, + "time_per_iteration": 2.8741681575775146 + }, + { + "auxiliary_loss_clip": 0.01135267, + "auxiliary_loss_mlp": 0.01089559, + "balance_loss_clip": 1.03459251, + "balance_loss_mlp": 1.00792491, + "epoch": 0.25383274213912105, + "flos": 21761615364480.0, + "grad_norm": 1.7118163342287434, + "language_loss": 0.73832083, + "learning_rate": 3.4972201540633676e-06, + "loss": 0.7605691, + "num_input_tokens_seen": 45067635, + "step": 2111, + "time_per_iteration": 2.900660753250122 + }, + { + "auxiliary_loss_clip": 0.0113608, + "auxiliary_loss_mlp": 0.01089801, + "balance_loss_clip": 1.03484201, + "balance_loss_mlp": 1.00821412, + "epoch": 0.2539529850297601, + "flos": 21396870708480.0, + "grad_norm": 1.9316665122623198, + "language_loss": 0.84942859, + "learning_rate": 3.4967035741142008e-06, + "loss": 0.87168741, + "num_input_tokens_seen": 45086455, + "step": 2112, + "time_per_iteration": 2.831545352935791 + }, + { + "auxiliary_loss_clip": 0.01128016, + "auxiliary_loss_mlp": 0.01087714, + "balance_loss_clip": 1.03033972, + "balance_loss_mlp": 1.00646114, + "epoch": 0.2540732279203992, + "flos": 25228467319680.0, + "grad_norm": 1.7654366316003778, + "language_loss": 0.82079297, + "learning_rate": 3.4961867671162917e-06, + "loss": 0.84295022, + "num_input_tokens_seen": 45106385, + "step": 2113, + "time_per_iteration": 2.873076915740967 + }, + { + "auxiliary_loss_clip": 0.01154966, + "auxiliary_loss_mlp": 0.01088133, + "balance_loss_clip": 1.03582799, + "balance_loss_mlp": 1.00640261, + "epoch": 0.2541934708110383, + "flos": 19427386037760.0, + "grad_norm": 2.495285863013618, + "language_loss": 0.76941878, + "learning_rate": 3.4956697331480402e-06, + "loss": 0.79184979, + "num_input_tokens_seen": 45124955, + "step": 2114, + "time_per_iteration": 2.8188908100128174 + }, + { + "auxiliary_loss_clip": 0.01117758, + "auxiliary_loss_mlp": 0.01089508, + "balance_loss_clip": 1.03307545, + "balance_loss_mlp": 1.00777829, + "epoch": 0.2543137137016774, + "flos": 23949436855680.0, + "grad_norm": 1.4369310870621699, + "language_loss": 0.79949486, + "learning_rate": 3.495152472287879e-06, + "loss": 0.82156748, + "num_input_tokens_seen": 45145665, + "step": 2115, + "time_per_iteration": 2.901602029800415 + }, + { + "auxiliary_loss_clip": 0.01126352, + "auxiliary_loss_mlp": 0.01088989, + "balance_loss_clip": 1.03412604, + "balance_loss_mlp": 1.00716329, + "epoch": 0.2544339565923165, + "flos": 25593283802880.0, + "grad_norm": 2.2942570826606494, + "language_loss": 0.74020261, + "learning_rate": 3.4946349846142766e-06, + "loss": 0.76235604, + "num_input_tokens_seen": 45164805, + "step": 2116, + "time_per_iteration": 2.8541057109832764 + }, + { + "auxiliary_loss_clip": 0.01153273, + "auxiliary_loss_mlp": 0.01088144, + "balance_loss_clip": 1.03393602, + "balance_loss_mlp": 1.00650907, + "epoch": 0.25455419948295555, + "flos": 21689470897920.0, + "grad_norm": 1.8801909719685894, + "language_loss": 0.757972, + "learning_rate": 3.4941172702057353e-06, + "loss": 0.78038621, + "num_input_tokens_seen": 45184865, + "step": 2117, + "time_per_iteration": 2.7754971981048584 + }, + { + "auxiliary_loss_clip": 0.01135154, + "auxiliary_loss_mlp": 0.01087947, + "balance_loss_clip": 1.03442049, + "balance_loss_mlp": 1.00645542, + "epoch": 0.25467444237359466, + "flos": 26250341339520.0, + "grad_norm": 1.8458143593065532, + "language_loss": 0.80760229, + "learning_rate": 3.4935993291407924e-06, + "loss": 0.82983327, + "num_input_tokens_seen": 45203690, + "step": 2118, + "time_per_iteration": 2.8372678756713867 + }, + { + "auxiliary_loss_clip": 0.01138656, + "auxiliary_loss_mlp": 0.01089253, + "balance_loss_clip": 1.03546524, + "balance_loss_mlp": 1.00757098, + "epoch": 0.25479468526423377, + "flos": 26979686997120.0, + "grad_norm": 2.1448282115874933, + "language_loss": 0.71482563, + "learning_rate": 3.4930811614980183e-06, + "loss": 0.73710477, + "num_input_tokens_seen": 45225385, + "step": 2119, + "time_per_iteration": 2.802093505859375 + }, + { + "auxiliary_loss_clip": 0.01146939, + "auxiliary_loss_mlp": 0.01090042, + "balance_loss_clip": 1.03602421, + "balance_loss_mlp": 1.00845551, + "epoch": 0.2549149281548728, + "flos": 23475811098240.0, + "grad_norm": 1.6003066133095631, + "language_loss": 0.79309684, + "learning_rate": 3.4925627673560198e-06, + "loss": 0.81546664, + "num_input_tokens_seen": 45246045, + "step": 2120, + "time_per_iteration": 2.794481039047241 + }, + { + "auxiliary_loss_clip": 0.01119156, + "auxiliary_loss_mlp": 0.01088119, + "balance_loss_clip": 1.02839613, + "balance_loss_mlp": 1.00643635, + "epoch": 0.25503517104551193, + "flos": 25812302981760.0, + "grad_norm": 1.7805046013251629, + "language_loss": 0.8838433, + "learning_rate": 3.4920441467934357e-06, + "loss": 0.90591609, + "num_input_tokens_seen": 45266560, + "step": 2121, + "time_per_iteration": 2.8332011699676514 + }, + { + "auxiliary_loss_clip": 0.01123916, + "auxiliary_loss_mlp": 0.01088535, + "balance_loss_clip": 1.03242064, + "balance_loss_mlp": 1.00699544, + "epoch": 0.25515541393615104, + "flos": 26645106787200.0, + "grad_norm": 2.116091043396192, + "language_loss": 0.83154881, + "learning_rate": 3.491525299888941e-06, + "loss": 0.85367334, + "num_input_tokens_seen": 45285405, + "step": 2122, + "time_per_iteration": 2.8460195064544678 + }, + { + "auxiliary_loss_clip": 0.01118395, + "auxiliary_loss_mlp": 0.00873159, + "balance_loss_clip": 1.04065537, + "balance_loss_mlp": 0.99975818, + "epoch": 0.2552756568267901, + "flos": 65955945847680.0, + "grad_norm": 0.8859751876212759, + "language_loss": 0.62766117, + "learning_rate": 3.491006226721244e-06, + "loss": 0.64757669, + "num_input_tokens_seen": 45349615, + "step": 2123, + "time_per_iteration": 3.374561071395874 + }, + { + "auxiliary_loss_clip": 0.01127246, + "auxiliary_loss_mlp": 0.00873749, + "balance_loss_clip": 1.03542781, + "balance_loss_mlp": 0.99986398, + "epoch": 0.2553958997174292, + "flos": 17931096161280.0, + "grad_norm": 1.830805727059156, + "language_loss": 0.77723253, + "learning_rate": 3.4904869273690882e-06, + "loss": 0.79724252, + "num_input_tokens_seen": 45367505, + "step": 2124, + "time_per_iteration": 3.6843173503875732 + }, + { + "auxiliary_loss_clip": 0.0114774, + "auxiliary_loss_mlp": 0.01088672, + "balance_loss_clip": 1.036551, + "balance_loss_mlp": 1.00694203, + "epoch": 0.2555161426080683, + "flos": 23367791923200.0, + "grad_norm": 2.2135879950025257, + "language_loss": 0.89026427, + "learning_rate": 3.489967401911251e-06, + "loss": 0.91262841, + "num_input_tokens_seen": 45386805, + "step": 2125, + "time_per_iteration": 2.833545446395874 + }, + { + "auxiliary_loss_clip": 0.01156271, + "auxiliary_loss_mlp": 0.01088292, + "balance_loss_clip": 1.03681862, + "balance_loss_mlp": 1.00646687, + "epoch": 0.2556363854987074, + "flos": 40625130723840.0, + "grad_norm": 1.5670005104019542, + "language_loss": 0.69316393, + "learning_rate": 3.4894476504265428e-06, + "loss": 0.71560955, + "num_input_tokens_seen": 45411045, + "step": 2126, + "time_per_iteration": 2.8379018306732178 + }, + { + "auxiliary_loss_clip": 0.01133679, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_clip": 1.0378983, + "balance_loss_mlp": 1.00070274, + "epoch": 0.2557566283893465, + "flos": 68019443389440.0, + "grad_norm": 0.7394827357145131, + "language_loss": 0.5461669, + "learning_rate": 3.4889276729938104e-06, + "loss": 0.568308, + "num_input_tokens_seen": 45469575, + "step": 2127, + "time_per_iteration": 3.219942808151245 + }, + { + "auxiliary_loss_clip": 0.01127041, + "auxiliary_loss_mlp": 0.01088925, + "balance_loss_clip": 1.03258634, + "balance_loss_mlp": 1.00714719, + "epoch": 0.2558768712799856, + "flos": 22635645004800.0, + "grad_norm": 2.2668915860727012, + "language_loss": 0.8069495, + "learning_rate": 3.488407469691934e-06, + "loss": 0.82910919, + "num_input_tokens_seen": 45490270, + "step": 2128, + "time_per_iteration": 3.8616676330566406 + }, + { + "auxiliary_loss_clip": 0.01139068, + "auxiliary_loss_mlp": 0.0109321, + "balance_loss_clip": 1.03582835, + "balance_loss_mlp": 1.01128983, + "epoch": 0.25599711417062465, + "flos": 26396354125440.0, + "grad_norm": 1.943729996095209, + "language_loss": 0.80780786, + "learning_rate": 3.487887040599828e-06, + "loss": 0.8301307, + "num_input_tokens_seen": 45510070, + "step": 2129, + "time_per_iteration": 3.7491812705993652 + }, + { + "auxiliary_loss_clip": 0.01156184, + "auxiliary_loss_mlp": 0.01088251, + "balance_loss_clip": 1.03717983, + "balance_loss_mlp": 1.0064739, + "epoch": 0.25611735706126376, + "flos": 22852042490880.0, + "grad_norm": 4.627955432840865, + "language_loss": 0.76090342, + "learning_rate": 3.4873663857964407e-06, + "loss": 0.78334779, + "num_input_tokens_seen": 45527285, + "step": 2130, + "time_per_iteration": 2.658099889755249 + }, + { + "auxiliary_loss_clip": 0.01110169, + "auxiliary_loss_mlp": 0.01089149, + "balance_loss_clip": 1.02796459, + "balance_loss_mlp": 1.00746703, + "epoch": 0.2562375999519028, + "flos": 23367863750400.0, + "grad_norm": 1.7623683677035638, + "language_loss": 0.6662091, + "learning_rate": 3.4868455053607556e-06, + "loss": 0.68820226, + "num_input_tokens_seen": 45546900, + "step": 2131, + "time_per_iteration": 3.7561216354370117 + }, + { + "auxiliary_loss_clip": 0.0114585, + "auxiliary_loss_mlp": 0.01089325, + "balance_loss_clip": 1.03451121, + "balance_loss_mlp": 1.00759506, + "epoch": 0.2563578428425419, + "flos": 22856962654080.0, + "grad_norm": 1.9977378621468864, + "language_loss": 0.713902, + "learning_rate": 3.486324399371789e-06, + "loss": 0.73625374, + "num_input_tokens_seen": 45566200, + "step": 2132, + "time_per_iteration": 2.735281467437744 + }, + { + "auxiliary_loss_clip": 0.01122101, + "auxiliary_loss_mlp": 0.010918, + "balance_loss_clip": 1.03273904, + "balance_loss_mlp": 1.01016581, + "epoch": 0.25647808573318104, + "flos": 21653883498240.0, + "grad_norm": 1.795733503248284, + "language_loss": 0.78352678, + "learning_rate": 3.485803067908593e-06, + "loss": 0.80566573, + "num_input_tokens_seen": 45585710, + "step": 2133, + "time_per_iteration": 2.992269277572632 + }, + { + "auxiliary_loss_clip": 0.01106893, + "auxiliary_loss_mlp": 0.01089669, + "balance_loss_clip": 1.03184462, + "balance_loss_mlp": 1.00779617, + "epoch": 0.2565983286238201, + "flos": 33730569659520.0, + "grad_norm": 1.6882796693321787, + "language_loss": 0.79697841, + "learning_rate": 3.485281511050253e-06, + "loss": 0.81894398, + "num_input_tokens_seen": 45607845, + "step": 2134, + "time_per_iteration": 3.039078712463379 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.0108831, + "balance_loss_clip": 1.03588426, + "balance_loss_mlp": 1.00677133, + "epoch": 0.2567185715144592, + "flos": 16216002587520.0, + "grad_norm": 2.2883312702264806, + "language_loss": 0.89853257, + "learning_rate": 3.484759728875889e-06, + "loss": 0.92078239, + "num_input_tokens_seen": 45623210, + "step": 2135, + "time_per_iteration": 2.8136374950408936 + }, + { + "auxiliary_loss_clip": 0.0111086, + "auxiliary_loss_mlp": 0.01088825, + "balance_loss_clip": 1.02938557, + "balance_loss_mlp": 1.00728607, + "epoch": 0.2568388144050983, + "flos": 17458475984640.0, + "grad_norm": 1.709877040801039, + "language_loss": 0.80703425, + "learning_rate": 3.4842377214646543e-06, + "loss": 0.82903105, + "num_input_tokens_seen": 45641505, + "step": 2136, + "time_per_iteration": 2.8256139755249023 + }, + { + "auxiliary_loss_clip": 0.01154874, + "auxiliary_loss_mlp": 0.01088313, + "balance_loss_clip": 1.03635752, + "balance_loss_mlp": 1.00686932, + "epoch": 0.25695905729573737, + "flos": 20887442069760.0, + "grad_norm": 1.6554821063582101, + "language_loss": 0.66980326, + "learning_rate": 3.483715488895737e-06, + "loss": 0.69223511, + "num_input_tokens_seen": 45661835, + "step": 2137, + "time_per_iteration": 2.610337734222412 + }, + { + "auxiliary_loss_clip": 0.01113857, + "auxiliary_loss_mlp": 0.01089609, + "balance_loss_clip": 1.03056538, + "balance_loss_mlp": 1.00778401, + "epoch": 0.2570793001863765, + "flos": 24717278914560.0, + "grad_norm": 1.794779279402946, + "language_loss": 0.78648007, + "learning_rate": 3.48319303124836e-06, + "loss": 0.80851471, + "num_input_tokens_seen": 45682215, + "step": 2138, + "time_per_iteration": 2.8586924076080322 + }, + { + "auxiliary_loss_clip": 0.011286, + "auxiliary_loss_mlp": 0.01089401, + "balance_loss_clip": 1.03035247, + "balance_loss_mlp": 1.00757599, + "epoch": 0.2571995430770156, + "flos": 26906896085760.0, + "grad_norm": 2.2130462956659573, + "language_loss": 0.67377955, + "learning_rate": 3.4826703486017798e-06, + "loss": 0.69595957, + "num_input_tokens_seen": 45701840, + "step": 2139, + "time_per_iteration": 2.7901110649108887 + }, + { + "auxiliary_loss_clip": 0.01144244, + "auxiliary_loss_mlp": 0.01090536, + "balance_loss_clip": 1.035043, + "balance_loss_mlp": 1.00899732, + "epoch": 0.25731978596765465, + "flos": 19792561656960.0, + "grad_norm": 1.54437267121873, + "language_loss": 0.76823759, + "learning_rate": 3.4821474410352867e-06, + "loss": 0.7905854, + "num_input_tokens_seen": 45720500, + "step": 2140, + "time_per_iteration": 2.6777000427246094 + }, + { + "auxiliary_loss_clip": 0.01095163, + "auxiliary_loss_mlp": 0.01080708, + "balance_loss_clip": 1.0275836, + "balance_loss_mlp": 1.00098097, + "epoch": 0.25744002885829376, + "flos": 70564970471040.0, + "grad_norm": 0.9268636464696307, + "language_loss": 0.62708169, + "learning_rate": 3.481624308628205e-06, + "loss": 0.64884037, + "num_input_tokens_seen": 45781870, + "step": 2141, + "time_per_iteration": 3.436626434326172 + }, + { + "auxiliary_loss_clip": 0.01135906, + "auxiliary_loss_mlp": 0.01089003, + "balance_loss_clip": 1.03509426, + "balance_loss_mlp": 1.00722551, + "epoch": 0.25756027174893287, + "flos": 18038181582720.0, + "grad_norm": 2.6486140306780586, + "language_loss": 1.00857925, + "learning_rate": 3.481100951459893e-06, + "loss": 1.03082836, + "num_input_tokens_seen": 45794890, + "step": 2142, + "time_per_iteration": 2.656433582305908 + }, + { + "auxiliary_loss_clip": 0.01142741, + "auxiliary_loss_mlp": 0.01090955, + "balance_loss_clip": 1.03384662, + "balance_loss_mlp": 1.00927269, + "epoch": 0.2576805146395719, + "flos": 22674069578880.0, + "grad_norm": 2.0646501158343096, + "language_loss": 0.78777444, + "learning_rate": 3.4805773696097453e-06, + "loss": 0.81011146, + "num_input_tokens_seen": 45815780, + "step": 2143, + "time_per_iteration": 2.83524751663208 + }, + { + "auxiliary_loss_clip": 0.01120533, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_clip": 1.02817488, + "balance_loss_mlp": 1.00872517, + "epoch": 0.25780075753021103, + "flos": 16472225278080.0, + "grad_norm": 2.137995079400924, + "language_loss": 0.87711549, + "learning_rate": 3.4800535631571874e-06, + "loss": 0.8992244, + "num_input_tokens_seen": 45831310, + "step": 2144, + "time_per_iteration": 2.798581123352051 + }, + { + "auxiliary_loss_clip": 0.01134588, + "auxiliary_loss_mlp": 0.01090318, + "balance_loss_clip": 1.03330088, + "balance_loss_mlp": 1.00830173, + "epoch": 0.25792100042085014, + "flos": 22820297846400.0, + "grad_norm": 2.048656885599085, + "language_loss": 0.76192695, + "learning_rate": 3.4795295321816804e-06, + "loss": 0.78417599, + "num_input_tokens_seen": 45850135, + "step": 2145, + "time_per_iteration": 2.8329732418060303 + }, + { + "auxiliary_loss_clip": 0.0113304, + "auxiliary_loss_mlp": 0.01088525, + "balance_loss_clip": 1.03293276, + "balance_loss_mlp": 1.00679493, + "epoch": 0.2580412433114892, + "flos": 18697286194560.0, + "grad_norm": 2.0603427858146084, + "language_loss": 0.90881497, + "learning_rate": 3.47900527676272e-06, + "loss": 0.93103063, + "num_input_tokens_seen": 45868470, + "step": 2146, + "time_per_iteration": 2.7398836612701416 + }, + { + "auxiliary_loss_clip": 0.01157161, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_clip": 1.03838909, + "balance_loss_mlp": 1.00742269, + "epoch": 0.2581614862021283, + "flos": 14283146810880.0, + "grad_norm": 1.8216335439497053, + "language_loss": 0.88541925, + "learning_rate": 3.478480796979835e-06, + "loss": 0.90788144, + "num_input_tokens_seen": 45886355, + "step": 2147, + "time_per_iteration": 2.6405134201049805 + }, + { + "auxiliary_loss_clip": 0.01132344, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_clip": 1.03241861, + "balance_loss_mlp": 1.00695276, + "epoch": 0.25828172909276736, + "flos": 29498281856640.0, + "grad_norm": 1.7375985144344144, + "language_loss": 0.78093815, + "learning_rate": 3.4779560929125894e-06, + "loss": 0.80314702, + "num_input_tokens_seen": 45907900, + "step": 2148, + "time_per_iteration": 2.8275277614593506 + }, + { + "auxiliary_loss_clip": 0.0111237, + "auxiliary_loss_mlp": 0.01079698, + "balance_loss_clip": 1.03613329, + "balance_loss_mlp": 0.99997121, + "epoch": 0.2584019719834065, + "flos": 67114387376640.0, + "grad_norm": 0.743393319355985, + "language_loss": 0.56924951, + "learning_rate": 3.4774311646405783e-06, + "loss": 0.59117019, + "num_input_tokens_seen": 45977805, + "step": 2149, + "time_per_iteration": 4.388237714767456 + }, + { + "auxiliary_loss_clip": 0.01125612, + "auxiliary_loss_mlp": 0.01088808, + "balance_loss_clip": 1.03282976, + "balance_loss_mlp": 1.00707793, + "epoch": 0.2585222148740456, + "flos": 22893555634560.0, + "grad_norm": 2.0546039539315246, + "language_loss": 0.83583248, + "learning_rate": 3.476906012243435e-06, + "loss": 0.85797668, + "num_input_tokens_seen": 45996715, + "step": 2150, + "time_per_iteration": 2.871782064437866 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.0108955, + "balance_loss_clip": 1.03681862, + "balance_loss_mlp": 1.00796282, + "epoch": 0.25864245776468464, + "flos": 28909202808960.0, + "grad_norm": 1.4968723310426264, + "language_loss": 0.8102994, + "learning_rate": 3.476380635800824e-06, + "loss": 0.83266759, + "num_input_tokens_seen": 46017915, + "step": 2151, + "time_per_iteration": 2.8716728687286377 + }, + { + "auxiliary_loss_clip": 0.01132022, + "auxiliary_loss_mlp": 0.01089314, + "balance_loss_clip": 1.03275418, + "balance_loss_mlp": 1.00758421, + "epoch": 0.25876270065532375, + "flos": 14793185980800.0, + "grad_norm": 2.0245703424520993, + "language_loss": 0.8599211, + "learning_rate": 3.475855035392444e-06, + "loss": 0.88213444, + "num_input_tokens_seen": 46033235, + "step": 2152, + "time_per_iteration": 3.887197971343994 + }, + { + "auxiliary_loss_clip": 0.01084776, + "auxiliary_loss_mlp": 0.01087379, + "balance_loss_clip": 1.02727985, + "balance_loss_mlp": 1.00564909, + "epoch": 0.25888294354596286, + "flos": 60467821810560.0, + "grad_norm": 1.7088814959861618, + "language_loss": 0.71284783, + "learning_rate": 3.475329211098029e-06, + "loss": 0.73456931, + "num_input_tokens_seen": 46056390, + "step": 2153, + "time_per_iteration": 3.211606025695801 + }, + { + "auxiliary_loss_clip": 0.01115699, + "auxiliary_loss_mlp": 0.01088185, + "balance_loss_clip": 1.03216052, + "balance_loss_mlp": 1.00636005, + "epoch": 0.2590031864366019, + "flos": 27851166771840.0, + "grad_norm": 1.546418217854311, + "language_loss": 0.82244599, + "learning_rate": 3.4748031629973453e-06, + "loss": 0.84448481, + "num_input_tokens_seen": 46077120, + "step": 2154, + "time_per_iteration": 3.9122540950775146 + }, + { + "auxiliary_loss_clip": 0.01093071, + "auxiliary_loss_mlp": 0.01080812, + "balance_loss_clip": 1.03220558, + "balance_loss_mlp": 1.00108469, + "epoch": 0.25912342932724103, + "flos": 62422444206720.0, + "grad_norm": 0.9111303923836285, + "language_loss": 0.56626046, + "learning_rate": 3.4742768911701944e-06, + "loss": 0.58799922, + "num_input_tokens_seen": 46139815, + "step": 2155, + "time_per_iteration": 3.473572015762329 + }, + { + "auxiliary_loss_clip": 0.01144251, + "auxiliary_loss_mlp": 0.01091632, + "balance_loss_clip": 1.03533268, + "balance_loss_mlp": 1.00966346, + "epoch": 0.25924367221788014, + "flos": 12378839368320.0, + "grad_norm": 2.507835205290458, + "language_loss": 0.7033509, + "learning_rate": 3.4737503956964113e-06, + "loss": 0.72570968, + "num_input_tokens_seen": 46152120, + "step": 2156, + "time_per_iteration": 2.7460803985595703 + }, + { + "auxiliary_loss_clip": 0.011336, + "auxiliary_loss_mlp": 0.01090261, + "balance_loss_clip": 1.03233218, + "balance_loss_mlp": 1.00824499, + "epoch": 0.2593639151085192, + "flos": 14575208296320.0, + "grad_norm": 3.0448886504760875, + "language_loss": 0.67662328, + "learning_rate": 3.473223676655865e-06, + "loss": 0.6988619, + "num_input_tokens_seen": 46170120, + "step": 2157, + "time_per_iteration": 3.710432767868042 + }, + { + "auxiliary_loss_clip": 0.01137962, + "auxiliary_loss_mlp": 0.01089515, + "balance_loss_clip": 1.03509057, + "balance_loss_mlp": 1.0077852, + "epoch": 0.2594841579991583, + "flos": 15230937029760.0, + "grad_norm": 2.1373561827762186, + "language_loss": 0.7937488, + "learning_rate": 3.472696734128459e-06, + "loss": 0.81602359, + "num_input_tokens_seen": 46187985, + "step": 2158, + "time_per_iteration": 2.7326698303222656 + }, + { + "auxiliary_loss_clip": 0.01144192, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_clip": 1.0350194, + "balance_loss_mlp": 1.00897706, + "epoch": 0.2596044008897974, + "flos": 23623583650560.0, + "grad_norm": 1.8066212845942318, + "language_loss": 0.75660896, + "learning_rate": 3.4721695681941286e-06, + "loss": 0.77895743, + "num_input_tokens_seen": 46207025, + "step": 2159, + "time_per_iteration": 2.7963054180145264 + }, + { + "auxiliary_loss_clip": 0.01132072, + "auxiliary_loss_mlp": 0.00874016, + "balance_loss_clip": 1.03247833, + "balance_loss_mlp": 1.00008345, + "epoch": 0.25972464378043647, + "flos": 13772281628160.0, + "grad_norm": 1.7900305176882354, + "language_loss": 0.81931895, + "learning_rate": 3.471642178932845e-06, + "loss": 0.83937985, + "num_input_tokens_seen": 46225670, + "step": 2160, + "time_per_iteration": 2.8647122383117676 + }, + { + "auxiliary_loss_clip": 0.01134766, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_clip": 1.03320527, + "balance_loss_mlp": 1.00785017, + "epoch": 0.2598448866710756, + "flos": 19573578391680.0, + "grad_norm": 2.2880462383006086, + "language_loss": 0.8929019, + "learning_rate": 3.471114566424613e-06, + "loss": 0.91514438, + "num_input_tokens_seen": 46244130, + "step": 2161, + "time_per_iteration": 2.8615193367004395 + }, + { + "auxiliary_loss_clip": 0.01133939, + "auxiliary_loss_mlp": 0.01089395, + "balance_loss_clip": 1.03396845, + "balance_loss_mlp": 1.00785601, + "epoch": 0.25996512956171464, + "flos": 21653237053440.0, + "grad_norm": 1.8238975907775385, + "language_loss": 0.7559644, + "learning_rate": 3.4705867307494715e-06, + "loss": 0.77819777, + "num_input_tokens_seen": 46263200, + "step": 2162, + "time_per_iteration": 2.764451503753662 + }, + { + "auxiliary_loss_clip": 0.01146473, + "auxiliary_loss_mlp": 0.01090526, + "balance_loss_clip": 1.03561592, + "balance_loss_mlp": 1.00874901, + "epoch": 0.26008537245235375, + "flos": 18223480869120.0, + "grad_norm": 2.6797679108784087, + "language_loss": 0.85291797, + "learning_rate": 3.470058671987492e-06, + "loss": 0.87528801, + "num_input_tokens_seen": 46281465, + "step": 2163, + "time_per_iteration": 2.787665367126465 + }, + { + "auxiliary_loss_clip": 0.01147738, + "auxiliary_loss_mlp": 0.01089807, + "balance_loss_clip": 1.03602421, + "balance_loss_mlp": 1.00788641, + "epoch": 0.26020561534299286, + "flos": 24645385843200.0, + "grad_norm": 1.6932207427409283, + "language_loss": 0.84452224, + "learning_rate": 3.4695303902187805e-06, + "loss": 0.86689764, + "num_input_tokens_seen": 46301020, + "step": 2164, + "time_per_iteration": 2.801236391067505 + }, + { + "auxiliary_loss_clip": 0.0111906, + "auxiliary_loss_mlp": 0.0108808, + "balance_loss_clip": 1.03499269, + "balance_loss_mlp": 1.00649273, + "epoch": 0.2603258582336319, + "flos": 25773662926080.0, + "grad_norm": 1.9128198898378588, + "language_loss": 0.78126371, + "learning_rate": 3.469001885523478e-06, + "loss": 0.80333513, + "num_input_tokens_seen": 46321740, + "step": 2165, + "time_per_iteration": 2.8077809810638428 + }, + { + "auxiliary_loss_clip": 0.01154932, + "auxiliary_loss_mlp": 0.0108944, + "balance_loss_clip": 1.03618264, + "balance_loss_mlp": 1.00775743, + "epoch": 0.260446101124271, + "flos": 28766314506240.0, + "grad_norm": 1.5419195902453557, + "language_loss": 0.80941856, + "learning_rate": 3.4684731579817568e-06, + "loss": 0.83186227, + "num_input_tokens_seen": 46342730, + "step": 2166, + "time_per_iteration": 2.8143627643585205 + }, + { + "auxiliary_loss_clip": 0.01096276, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_clip": 1.02939415, + "balance_loss_mlp": 1.00612295, + "epoch": 0.26056634401491013, + "flos": 25666757072640.0, + "grad_norm": 1.5482654416981119, + "language_loss": 0.76670045, + "learning_rate": 3.4679442076738247e-06, + "loss": 0.78854132, + "num_input_tokens_seen": 46362445, + "step": 2167, + "time_per_iteration": 2.9615871906280518 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01087409, + "balance_loss_clip": 1.03672004, + "balance_loss_mlp": 1.00567889, + "epoch": 0.2606865869055492, + "flos": 27052765217280.0, + "grad_norm": 2.125863093163849, + "language_loss": 0.83254021, + "learning_rate": 3.4674150346799245e-06, + "loss": 0.8549701, + "num_input_tokens_seen": 46382145, + "step": 2168, + "time_per_iteration": 2.7995097637176514 + }, + { + "auxiliary_loss_clip": 0.01127534, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.0319922, + "balance_loss_mlp": 1.00649762, + "epoch": 0.2608068297961883, + "flos": 17712615686400.0, + "grad_norm": 2.9528672106574563, + "language_loss": 0.79798043, + "learning_rate": 3.4668856390803295e-06, + "loss": 0.8201409, + "num_input_tokens_seen": 46400025, + "step": 2169, + "time_per_iteration": 2.7135043144226074 + }, + { + "auxiliary_loss_clip": 0.01145701, + "auxiliary_loss_mlp": 0.01087152, + "balance_loss_clip": 1.03614688, + "balance_loss_mlp": 1.00551724, + "epoch": 0.2609270726868274, + "flos": 18551632544640.0, + "grad_norm": 1.955307154844769, + "language_loss": 0.90025985, + "learning_rate": 3.4663560209553495e-06, + "loss": 0.92258835, + "num_input_tokens_seen": 46418090, + "step": 2170, + "time_per_iteration": 2.786115884780884 + }, + { + "auxiliary_loss_clip": 0.01135948, + "auxiliary_loss_mlp": 0.01089787, + "balance_loss_clip": 1.03466117, + "balance_loss_mlp": 1.00805759, + "epoch": 0.26104731557746647, + "flos": 21835699165440.0, + "grad_norm": 1.6702738743438525, + "language_loss": 0.79106343, + "learning_rate": 3.4658261803853267e-06, + "loss": 0.81332076, + "num_input_tokens_seen": 46436015, + "step": 2171, + "time_per_iteration": 2.9138083457946777 + }, + { + "auxiliary_loss_clip": 0.01132497, + "auxiliary_loss_mlp": 0.01086881, + "balance_loss_clip": 1.03335381, + "balance_loss_mlp": 1.00515115, + "epoch": 0.2611675584681056, + "flos": 21689650465920.0, + "grad_norm": 1.8886110600281156, + "language_loss": 0.80757403, + "learning_rate": 3.4652961174506383e-06, + "loss": 0.82976782, + "num_input_tokens_seen": 46455885, + "step": 2172, + "time_per_iteration": 2.7153844833374023 + }, + { + "auxiliary_loss_clip": 0.01124248, + "auxiliary_loss_mlp": 0.01080379, + "balance_loss_clip": 1.03882086, + "balance_loss_mlp": 1.0006516, + "epoch": 0.2612878013587447, + "flos": 71862101389440.0, + "grad_norm": 0.9776899550449547, + "language_loss": 0.5819397, + "learning_rate": 3.464765832231694e-06, + "loss": 0.60398591, + "num_input_tokens_seen": 46510050, + "step": 2173, + "time_per_iteration": 3.2967653274536133 + }, + { + "auxiliary_loss_clip": 0.01146058, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_clip": 1.036731, + "balance_loss_mlp": 1.007478, + "epoch": 0.26140804424938374, + "flos": 20227511445120.0, + "grad_norm": 1.785321347251858, + "language_loss": 0.70664108, + "learning_rate": 3.4642353248089373e-06, + "loss": 0.72899133, + "num_input_tokens_seen": 46528810, + "step": 2174, + "time_per_iteration": 3.8698477745056152 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.01088898, + "balance_loss_clip": 1.03450203, + "balance_loss_mlp": 1.00712097, + "epoch": 0.26152828714002285, + "flos": 25557085872000.0, + "grad_norm": 1.7823237573844846, + "language_loss": 0.80607355, + "learning_rate": 3.463704595262846e-06, + "loss": 0.82831991, + "num_input_tokens_seen": 46549690, + "step": 2175, + "time_per_iteration": 2.8995041847229004 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.01089188, + "balance_loss_clip": 1.03246629, + "balance_loss_mlp": 1.00755334, + "epoch": 0.26164853003066196, + "flos": 25446516831360.0, + "grad_norm": 1.8334598196641172, + "language_loss": 0.70635486, + "learning_rate": 3.463173643673931e-06, + "loss": 0.72846657, + "num_input_tokens_seen": 46572215, + "step": 2176, + "time_per_iteration": 2.947526216506958 + }, + { + "auxiliary_loss_clip": 0.01146513, + "auxiliary_loss_mlp": 0.01080255, + "balance_loss_clip": 1.0502758, + "balance_loss_mlp": 1.00052774, + "epoch": 0.261768772921301, + "flos": 53944580568960.0, + "grad_norm": 0.8988794579089813, + "language_loss": 0.63469613, + "learning_rate": 3.4626424701227387e-06, + "loss": 0.65696383, + "num_input_tokens_seen": 46627275, + "step": 2177, + "time_per_iteration": 3.3083603382110596 + }, + { + "auxiliary_loss_clip": 0.01154046, + "auxiliary_loss_mlp": 0.0108024, + "balance_loss_clip": 1.05010295, + "balance_loss_mlp": 1.0005126, + "epoch": 0.26188901581194013, + "flos": 70687606481280.0, + "grad_norm": 0.8239835043561283, + "language_loss": 0.5578649, + "learning_rate": 3.4621110746898452e-06, + "loss": 0.58020777, + "num_input_tokens_seen": 46695135, + "step": 2178, + "time_per_iteration": 4.949493885040283 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01088309, + "balance_loss_clip": 1.03697681, + "balance_loss_mlp": 1.0068171, + "epoch": 0.2620092587025792, + "flos": 21069580959360.0, + "grad_norm": 2.8275796996958467, + "language_loss": 0.74647385, + "learning_rate": 3.4615794574558654e-06, + "loss": 0.76883256, + "num_input_tokens_seen": 46714145, + "step": 2179, + "time_per_iteration": 3.8087637424468994 + }, + { + "auxiliary_loss_clip": 0.01132405, + "auxiliary_loss_mlp": 0.0108889, + "balance_loss_clip": 1.03371859, + "balance_loss_mlp": 1.00720811, + "epoch": 0.2621295015932183, + "flos": 18369601395840.0, + "grad_norm": 2.043516144182347, + "language_loss": 0.83786917, + "learning_rate": 3.4610476185014436e-06, + "loss": 0.86008209, + "num_input_tokens_seen": 46731405, + "step": 2180, + "time_per_iteration": 2.8328723907470703 + }, + { + "auxiliary_loss_clip": 0.0115551, + "auxiliary_loss_mlp": 0.01088458, + "balance_loss_clip": 1.03690958, + "balance_loss_mlp": 1.00663233, + "epoch": 0.2622497444838574, + "flos": 23659997063040.0, + "grad_norm": 1.973886634000373, + "language_loss": 0.79463547, + "learning_rate": 3.4605155579072597e-06, + "loss": 0.81707513, + "num_input_tokens_seen": 46751260, + "step": 2181, + "time_per_iteration": 2.748708486557007 + }, + { + "auxiliary_loss_clip": 0.01113356, + "auxiliary_loss_mlp": 0.01090908, + "balance_loss_clip": 1.0304594, + "balance_loss_mlp": 1.00922608, + "epoch": 0.26236998737449646, + "flos": 22123810154880.0, + "grad_norm": 1.7879481666665855, + "language_loss": 0.71375477, + "learning_rate": 3.459983275754027e-06, + "loss": 0.73579741, + "num_input_tokens_seen": 46770155, + "step": 2182, + "time_per_iteration": 3.770838975906372 + }, + { + "auxiliary_loss_clip": 0.01156167, + "auxiliary_loss_mlp": 0.01089376, + "balance_loss_clip": 1.03748322, + "balance_loss_mlp": 1.00759804, + "epoch": 0.26249023026513557, + "flos": 17895185539200.0, + "grad_norm": 2.4541460061285325, + "language_loss": 0.79939091, + "learning_rate": 3.4594507721224918e-06, + "loss": 0.82184631, + "num_input_tokens_seen": 46788805, + "step": 2183, + "time_per_iteration": 2.715806722640991 + }, + { + "auxiliary_loss_clip": 0.01138917, + "auxiliary_loss_mlp": 0.01089576, + "balance_loss_clip": 1.03720546, + "balance_loss_mlp": 1.00784612, + "epoch": 0.2626104731557747, + "flos": 18332936588160.0, + "grad_norm": 1.5586068521599965, + "language_loss": 0.81472468, + "learning_rate": 3.4589180470934353e-06, + "loss": 0.83700955, + "num_input_tokens_seen": 46808670, + "step": 2184, + "time_per_iteration": 2.7797739505767822 + }, + { + "auxiliary_loss_clip": 0.01136523, + "auxiliary_loss_mlp": 0.0108669, + "balance_loss_clip": 1.03550887, + "balance_loss_mlp": 1.00491285, + "epoch": 0.26273071604641374, + "flos": 19317714837120.0, + "grad_norm": 1.7489970816449685, + "language_loss": 0.76436424, + "learning_rate": 3.4583851007476713e-06, + "loss": 0.78659642, + "num_input_tokens_seen": 46827140, + "step": 2185, + "time_per_iteration": 2.712574005126953 + }, + { + "auxiliary_loss_clip": 0.01121218, + "auxiliary_loss_mlp": 0.01088724, + "balance_loss_clip": 1.03117108, + "balance_loss_mlp": 1.00689828, + "epoch": 0.26285095893705285, + "flos": 18327477720960.0, + "grad_norm": 2.6192161355970067, + "language_loss": 0.68775845, + "learning_rate": 3.4578519331660464e-06, + "loss": 0.70985788, + "num_input_tokens_seen": 46844135, + "step": 2186, + "time_per_iteration": 2.766101360321045 + }, + { + "auxiliary_loss_clip": 0.01144948, + "auxiliary_loss_mlp": 0.01088039, + "balance_loss_clip": 1.03613782, + "balance_loss_mlp": 1.00640488, + "epoch": 0.26297120182769196, + "flos": 20193827466240.0, + "grad_norm": 1.74142024312288, + "language_loss": 0.81982964, + "learning_rate": 3.4573185444294426e-06, + "loss": 0.84215951, + "num_input_tokens_seen": 46862500, + "step": 2187, + "time_per_iteration": 2.789726734161377 + }, + { + "auxiliary_loss_clip": 0.01138583, + "auxiliary_loss_mlp": 0.00874118, + "balance_loss_clip": 1.03629231, + "balance_loss_mlp": 1.000193, + "epoch": 0.263091444718331, + "flos": 22418421505920.0, + "grad_norm": 1.6232946141448787, + "language_loss": 0.78685582, + "learning_rate": 3.456784934618774e-06, + "loss": 0.80698282, + "num_input_tokens_seen": 46883665, + "step": 2188, + "time_per_iteration": 2.7946956157684326 + }, + { + "auxiliary_loss_clip": 0.01136855, + "auxiliary_loss_mlp": 0.01088157, + "balance_loss_clip": 1.03464532, + "balance_loss_mlp": 1.00661826, + "epoch": 0.2632116876089701, + "flos": 19024827338880.0, + "grad_norm": 2.271224700733484, + "language_loss": 0.79865098, + "learning_rate": 3.4562511038149897e-06, + "loss": 0.82090116, + "num_input_tokens_seen": 46899160, + "step": 2189, + "time_per_iteration": 2.8051042556762695 + }, + { + "auxiliary_loss_clip": 0.01099999, + "auxiliary_loss_mlp": 0.01080038, + "balance_loss_clip": 1.03229284, + "balance_loss_mlp": 1.00031126, + "epoch": 0.26333193049960923, + "flos": 67308054531840.0, + "grad_norm": 0.8553467880879441, + "language_loss": 0.57746315, + "learning_rate": 3.4557170520990705e-06, + "loss": 0.59926355, + "num_input_tokens_seen": 46959835, + "step": 2190, + "time_per_iteration": 3.5305395126342773 + }, + { + "auxiliary_loss_clip": 0.01145003, + "auxiliary_loss_mlp": 0.01087554, + "balance_loss_clip": 1.03635716, + "balance_loss_mlp": 1.00606203, + "epoch": 0.2634521733902483, + "flos": 25048806468480.0, + "grad_norm": 1.3966644909110129, + "language_loss": 0.86282861, + "learning_rate": 3.4551827795520324e-06, + "loss": 0.88515419, + "num_input_tokens_seen": 46982720, + "step": 2191, + "time_per_iteration": 2.7560651302337646 + }, + { + "auxiliary_loss_clip": 0.011459, + "auxiliary_loss_mlp": 0.01089145, + "balance_loss_clip": 1.03577113, + "balance_loss_mlp": 1.00741518, + "epoch": 0.2635724162808874, + "flos": 20594985534720.0, + "grad_norm": 1.7288554613240832, + "language_loss": 0.85029924, + "learning_rate": 3.4546482862549226e-06, + "loss": 0.87264967, + "num_input_tokens_seen": 47003035, + "step": 2192, + "time_per_iteration": 2.910161018371582 + }, + { + "auxiliary_loss_clip": 0.01120331, + "auxiliary_loss_mlp": 0.01087925, + "balance_loss_clip": 1.03288043, + "balance_loss_mlp": 1.00610018, + "epoch": 0.2636926591715265, + "flos": 19244636616960.0, + "grad_norm": 4.853658939565863, + "language_loss": 0.78786421, + "learning_rate": 3.4541135722888253e-06, + "loss": 0.80994678, + "num_input_tokens_seen": 47019625, + "step": 2193, + "time_per_iteration": 2.8903443813323975 + }, + { + "auxiliary_loss_clip": 0.01153653, + "auxiliary_loss_mlp": 0.01089292, + "balance_loss_clip": 1.03525758, + "balance_loss_mlp": 1.0075618, + "epoch": 0.26381290206216557, + "flos": 28804882734720.0, + "grad_norm": 5.98465108176121, + "language_loss": 0.79981691, + "learning_rate": 3.453578637734854e-06, + "loss": 0.82224637, + "num_input_tokens_seen": 47040815, + "step": 2194, + "time_per_iteration": 2.7797865867614746 + }, + { + "auxiliary_loss_clip": 0.01157613, + "auxiliary_loss_mlp": 0.01091291, + "balance_loss_clip": 1.03956628, + "balance_loss_mlp": 1.00951338, + "epoch": 0.2639331449528047, + "flos": 25008909436800.0, + "grad_norm": 1.5277916363258504, + "language_loss": 0.78380686, + "learning_rate": 3.4530434826741605e-06, + "loss": 0.80629593, + "num_input_tokens_seen": 47061755, + "step": 2195, + "time_per_iteration": 2.713595151901245 + }, + { + "auxiliary_loss_clip": 0.01133225, + "auxiliary_loss_mlp": 0.01088167, + "balance_loss_clip": 1.0340451, + "balance_loss_mlp": 1.00662792, + "epoch": 0.26405338784344373, + "flos": 46535775465600.0, + "grad_norm": 1.6054863216795352, + "language_loss": 0.68856907, + "learning_rate": 3.452508107187926e-06, + "loss": 0.71078295, + "num_input_tokens_seen": 47085130, + "step": 2196, + "time_per_iteration": 2.927546501159668 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.01089295, + "balance_loss_clip": 1.03214931, + "balance_loss_mlp": 1.00742209, + "epoch": 0.26417363073408284, + "flos": 21179467641600.0, + "grad_norm": 1.7277305519183157, + "language_loss": 0.77187109, + "learning_rate": 3.451972511357366e-06, + "loss": 0.79383165, + "num_input_tokens_seen": 47104675, + "step": 2197, + "time_per_iteration": 2.968546152114868 + }, + { + "auxiliary_loss_clip": 0.01142955, + "auxiliary_loss_mlp": 0.01088271, + "balance_loss_clip": 1.03514922, + "balance_loss_mlp": 1.00682712, + "epoch": 0.26429387362472195, + "flos": 22674751937280.0, + "grad_norm": 1.5954300936287527, + "language_loss": 0.85098916, + "learning_rate": 3.45143669526373e-06, + "loss": 0.87330151, + "num_input_tokens_seen": 47124435, + "step": 2198, + "time_per_iteration": 2.7236833572387695 + }, + { + "auxiliary_loss_clip": 0.01133335, + "auxiliary_loss_mlp": 0.0108006, + "balance_loss_clip": 1.04616606, + "balance_loss_mlp": 1.00033319, + "epoch": 0.264414116515361, + "flos": 67180534272000.0, + "grad_norm": 0.8019614688251542, + "language_loss": 0.6327185, + "learning_rate": 3.450900658988302e-06, + "loss": 0.65485245, + "num_input_tokens_seen": 47185985, + "step": 2199, + "time_per_iteration": 3.2529094219207764 + }, + { + "auxiliary_loss_clip": 0.01135748, + "auxiliary_loss_mlp": 0.0108946, + "balance_loss_clip": 1.03560519, + "balance_loss_mlp": 1.0076344, + "epoch": 0.2645343594060001, + "flos": 25664709997440.0, + "grad_norm": 1.915840155041895, + "language_loss": 0.77586818, + "learning_rate": 3.450364402612397e-06, + "loss": 0.79812026, + "num_input_tokens_seen": 47203140, + "step": 2200, + "time_per_iteration": 3.8356151580810547 + }, + { + "auxiliary_loss_clip": 0.01128808, + "auxiliary_loss_mlp": 0.01087981, + "balance_loss_clip": 1.03114223, + "balance_loss_mlp": 1.00620365, + "epoch": 0.26465460229663923, + "flos": 22491822948480.0, + "grad_norm": 1.8217540494202487, + "language_loss": 0.83974266, + "learning_rate": 3.449827926217366e-06, + "loss": 0.86191052, + "num_input_tokens_seen": 47222575, + "step": 2201, + "time_per_iteration": 2.7528817653656006 + }, + { + "auxiliary_loss_clip": 0.01135, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_clip": 1.03286004, + "balance_loss_mlp": 1.00683749, + "epoch": 0.2647748451872783, + "flos": 29388036038400.0, + "grad_norm": 1.7065596527488707, + "language_loss": 0.80433881, + "learning_rate": 3.449291229884591e-06, + "loss": 0.82657397, + "num_input_tokens_seen": 47243815, + "step": 2202, + "time_per_iteration": 2.819441080093384 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01087941, + "balance_loss_clip": 1.03319001, + "balance_loss_mlp": 1.00635421, + "epoch": 0.2648950880779174, + "flos": 26797799502720.0, + "grad_norm": 1.861475402778703, + "language_loss": 0.86616939, + "learning_rate": 3.4487543136954887e-06, + "loss": 0.88829768, + "num_input_tokens_seen": 47263435, + "step": 2203, + "time_per_iteration": 4.2410218715667725 + }, + { + "auxiliary_loss_clip": 0.01122803, + "auxiliary_loss_mlp": 0.01088631, + "balance_loss_clip": 1.03204215, + "balance_loss_mlp": 1.00704455, + "epoch": 0.2650153309685565, + "flos": 28841008838400.0, + "grad_norm": 1.639265317230153, + "language_loss": 0.90904665, + "learning_rate": 3.448217177731509e-06, + "loss": 0.93116105, + "num_input_tokens_seen": 47283920, + "step": 2204, + "time_per_iteration": 3.748997449874878 + }, + { + "auxiliary_loss_clip": 0.01125435, + "auxiliary_loss_mlp": 0.01088911, + "balance_loss_clip": 1.0327239, + "balance_loss_mlp": 1.00737226, + "epoch": 0.26513557385919556, + "flos": 20303247271680.0, + "grad_norm": 9.91840319987265, + "language_loss": 0.77662724, + "learning_rate": 3.4476798220741348e-06, + "loss": 0.79877073, + "num_input_tokens_seen": 47302800, + "step": 2205, + "time_per_iteration": 2.7692954540252686 + }, + { + "auxiliary_loss_clip": 0.01155497, + "auxiliary_loss_mlp": 0.01090888, + "balance_loss_clip": 1.03754485, + "balance_loss_mlp": 1.0092535, + "epoch": 0.26525581674983467, + "flos": 17676274101120.0, + "grad_norm": 1.6170747711469469, + "language_loss": 0.78266495, + "learning_rate": 3.4471422468048826e-06, + "loss": 0.80512881, + "num_input_tokens_seen": 47321525, + "step": 2206, + "time_per_iteration": 2.683138608932495 + }, + { + "auxiliary_loss_clip": 0.01147988, + "auxiliary_loss_mlp": 0.01088933, + "balance_loss_clip": 1.03854072, + "balance_loss_mlp": 1.00720263, + "epoch": 0.2653760596404738, + "flos": 26833746038400.0, + "grad_norm": 4.974944627495642, + "language_loss": 0.73146832, + "learning_rate": 3.4466044520053022e-06, + "loss": 0.75383759, + "num_input_tokens_seen": 47340530, + "step": 2207, + "time_per_iteration": 3.910465955734253 + }, + { + "auxiliary_loss_clip": 0.01136662, + "auxiliary_loss_mlp": 0.01089655, + "balance_loss_clip": 1.03509545, + "balance_loss_mlp": 1.00797296, + "epoch": 0.26549630253111284, + "flos": 22782160581120.0, + "grad_norm": 3.049118035521412, + "language_loss": 0.60330176, + "learning_rate": 3.446066437756977e-06, + "loss": 0.62556493, + "num_input_tokens_seen": 47359735, + "step": 2208, + "time_per_iteration": 2.9149703979492188 + }, + { + "auxiliary_loss_clip": 0.01132554, + "auxiliary_loss_mlp": 0.01087103, + "balance_loss_clip": 1.03366959, + "balance_loss_mlp": 1.00551617, + "epoch": 0.26561654542175195, + "flos": 23550002640000.0, + "grad_norm": 1.98235745333818, + "language_loss": 0.74660707, + "learning_rate": 3.4455282041415224e-06, + "loss": 0.7688036, + "num_input_tokens_seen": 47378945, + "step": 2209, + "time_per_iteration": 2.798737049102783 + }, + { + "auxiliary_loss_clip": 0.01121753, + "auxiliary_loss_mlp": 0.01091184, + "balance_loss_clip": 1.0316819, + "balance_loss_mlp": 1.00945377, + "epoch": 0.265736788312391, + "flos": 26906680604160.0, + "grad_norm": 1.9019214223158372, + "language_loss": 0.86789966, + "learning_rate": 3.4449897512405894e-06, + "loss": 0.89002907, + "num_input_tokens_seen": 47398095, + "step": 2210, + "time_per_iteration": 2.9429609775543213 + }, + { + "auxiliary_loss_clip": 0.01103518, + "auxiliary_loss_mlp": 0.00873975, + "balance_loss_clip": 1.02981603, + "balance_loss_mlp": 1.00024366, + "epoch": 0.2658570312030301, + "flos": 23477139901440.0, + "grad_norm": 1.8888268154237449, + "language_loss": 0.75272661, + "learning_rate": 3.444451079135859e-06, + "loss": 0.77250159, + "num_input_tokens_seen": 47417605, + "step": 2211, + "time_per_iteration": 3.003361701965332 + }, + { + "auxiliary_loss_clip": 0.01113336, + "auxiliary_loss_mlp": 0.00874033, + "balance_loss_clip": 1.03094673, + "balance_loss_mlp": 1.00011468, + "epoch": 0.2659772740936692, + "flos": 21866402315520.0, + "grad_norm": 1.8904157135266364, + "language_loss": 0.73430389, + "learning_rate": 3.4439121879090493e-06, + "loss": 0.75417757, + "num_input_tokens_seen": 47435385, + "step": 2212, + "time_per_iteration": 2.8775007724761963 + }, + { + "auxiliary_loss_clip": 0.01138533, + "auxiliary_loss_mlp": 0.01087878, + "balance_loss_clip": 1.03710103, + "balance_loss_mlp": 1.00633836, + "epoch": 0.2660975169843083, + "flos": 19793100360960.0, + "grad_norm": 1.9591950648456669, + "language_loss": 0.83870852, + "learning_rate": 3.4433730776419082e-06, + "loss": 0.86097264, + "num_input_tokens_seen": 47454310, + "step": 2213, + "time_per_iteration": 2.8814258575439453 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.0087401, + "balance_loss_clip": 1.03540301, + "balance_loss_mlp": 1.00019991, + "epoch": 0.2662177598749474, + "flos": 29018981750400.0, + "grad_norm": 1.9860162682553417, + "language_loss": 0.80400872, + "learning_rate": 3.4428337484162183e-06, + "loss": 0.82419789, + "num_input_tokens_seen": 47475120, + "step": 2214, + "time_per_iteration": 2.7986042499542236 + }, + { + "auxiliary_loss_clip": 0.01137498, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_clip": 1.03588462, + "balance_loss_mlp": 1.00708306, + "epoch": 0.2663380027655865, + "flos": 21762549118080.0, + "grad_norm": 1.7683980347106094, + "language_loss": 0.83946395, + "learning_rate": 3.442294200313797e-06, + "loss": 0.86172557, + "num_input_tokens_seen": 47493150, + "step": 2215, + "time_per_iteration": 2.9435665607452393 + }, + { + "auxiliary_loss_clip": 0.01148328, + "auxiliary_loss_mlp": 0.01079881, + "balance_loss_clip": 1.04544449, + "balance_loss_mlp": 1.00015354, + "epoch": 0.26645824565622556, + "flos": 66980333819520.0, + "grad_norm": 0.7604947765065557, + "language_loss": 0.5271163, + "learning_rate": 3.4417544334164916e-06, + "loss": 0.54939842, + "num_input_tokens_seen": 47557295, + "step": 2216, + "time_per_iteration": 3.3321545124053955 + }, + { + "auxiliary_loss_clip": 0.01115987, + "auxiliary_loss_mlp": 0.0108767, + "balance_loss_clip": 1.03105664, + "balance_loss_mlp": 1.00627422, + "epoch": 0.26657848854686467, + "flos": 25264198373760.0, + "grad_norm": 1.6128029437081752, + "language_loss": 0.77390963, + "learning_rate": 3.4412144478061854e-06, + "loss": 0.79594618, + "num_input_tokens_seen": 47579705, + "step": 2217, + "time_per_iteration": 2.9489240646362305 + }, + { + "auxiliary_loss_clip": 0.01082549, + "auxiliary_loss_mlp": 0.01089529, + "balance_loss_clip": 1.02698171, + "balance_loss_mlp": 1.00784659, + "epoch": 0.2666987314375038, + "flos": 23696769611520.0, + "grad_norm": 2.265357492857357, + "language_loss": 0.75544667, + "learning_rate": 3.4406742435647925e-06, + "loss": 0.77716744, + "num_input_tokens_seen": 47599770, + "step": 2218, + "time_per_iteration": 3.2703874111175537 + }, + { + "auxiliary_loss_clip": 0.01142043, + "auxiliary_loss_mlp": 0.0108702, + "balance_loss_clip": 1.03403854, + "balance_loss_mlp": 1.00562382, + "epoch": 0.26681897432814283, + "flos": 27048958375680.0, + "grad_norm": 1.7409901393425304, + "language_loss": 0.78549838, + "learning_rate": 3.440133820774263e-06, + "loss": 0.80778897, + "num_input_tokens_seen": 47619580, + "step": 2219, + "time_per_iteration": 2.9764883518218994 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01088756, + "balance_loss_clip": 1.03401661, + "balance_loss_mlp": 1.00702584, + "epoch": 0.26693921721878194, + "flos": 28985944216320.0, + "grad_norm": 2.273797311098203, + "language_loss": 0.81830347, + "learning_rate": 3.439593179516578e-06, + "loss": 0.84053576, + "num_input_tokens_seen": 47639490, + "step": 2220, + "time_per_iteration": 2.857433795928955 + }, + { + "auxiliary_loss_clip": 0.01125118, + "auxiliary_loss_mlp": 0.01088857, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.00707936, + "epoch": 0.26705946010942105, + "flos": 21507834798720.0, + "grad_norm": 1.872429143866718, + "language_loss": 0.80855787, + "learning_rate": 3.4390523198737524e-06, + "loss": 0.8306976, + "num_input_tokens_seen": 47658650, + "step": 2221, + "time_per_iteration": 2.8090672492980957 + }, + { + "auxiliary_loss_clip": 0.01155359, + "auxiliary_loss_mlp": 0.00873959, + "balance_loss_clip": 1.03734589, + "balance_loss_mlp": 1.00020194, + "epoch": 0.2671797030000601, + "flos": 21471277731840.0, + "grad_norm": 1.8470064012089549, + "language_loss": 0.73552084, + "learning_rate": 3.4385112419278333e-06, + "loss": 0.75581402, + "num_input_tokens_seen": 47679875, + "step": 2222, + "time_per_iteration": 2.7424745559692383 + }, + { + "auxiliary_loss_clip": 0.01140467, + "auxiliary_loss_mlp": 0.01079714, + "balance_loss_clip": 1.04541254, + "balance_loss_mlp": 0.99998683, + "epoch": 0.2672999458906992, + "flos": 64189929767040.0, + "grad_norm": 0.790986088753665, + "language_loss": 0.64813715, + "learning_rate": 3.4379699457609033e-06, + "loss": 0.67033899, + "num_input_tokens_seen": 47737700, + "step": 2223, + "time_per_iteration": 3.1728107929229736 + }, + { + "auxiliary_loss_clip": 0.01131902, + "auxiliary_loss_mlp": 0.01090073, + "balance_loss_clip": 1.03168344, + "balance_loss_mlp": 1.00819969, + "epoch": 0.26742018878133833, + "flos": 16909042573440.0, + "grad_norm": 1.6828962859237087, + "language_loss": 0.90093195, + "learning_rate": 3.4374284314550755e-06, + "loss": 0.92315167, + "num_input_tokens_seen": 47756740, + "step": 2224, + "time_per_iteration": 2.7794535160064697 + }, + { + "auxiliary_loss_clip": 0.01153878, + "auxiliary_loss_mlp": 0.0108808, + "balance_loss_clip": 1.0364486, + "balance_loss_mlp": 1.00649285, + "epoch": 0.2675404316719774, + "flos": 20667560964480.0, + "grad_norm": 2.9697289394026316, + "language_loss": 0.8071292, + "learning_rate": 3.436886699092498e-06, + "loss": 0.82954884, + "num_input_tokens_seen": 47775255, + "step": 2225, + "time_per_iteration": 3.5715904235839844 + }, + { + "auxiliary_loss_clip": 0.01154878, + "auxiliary_loss_mlp": 0.01088575, + "balance_loss_clip": 1.03699124, + "balance_loss_mlp": 1.00684547, + "epoch": 0.2676606745626165, + "flos": 17485013157120.0, + "grad_norm": 2.2918946743253494, + "language_loss": 0.71207088, + "learning_rate": 3.4363447487553502e-06, + "loss": 0.73450541, + "num_input_tokens_seen": 47788570, + "step": 2226, + "time_per_iteration": 2.599520206451416 + }, + { + "auxiliary_loss_clip": 0.01124143, + "auxiliary_loss_mlp": 0.01088629, + "balance_loss_clip": 1.03122616, + "balance_loss_mlp": 1.00694668, + "epoch": 0.26778091745325555, + "flos": 27852675143040.0, + "grad_norm": 1.8363603781621858, + "language_loss": 0.77777821, + "learning_rate": 3.4358025805258455e-06, + "loss": 0.79990596, + "num_input_tokens_seen": 47808275, + "step": 2227, + "time_per_iteration": 2.831725835800171 + }, + { + "auxiliary_loss_clip": 0.01114539, + "auxiliary_loss_mlp": 0.01087042, + "balance_loss_clip": 1.03255665, + "balance_loss_mlp": 1.00540781, + "epoch": 0.26790116034389466, + "flos": 20955995176320.0, + "grad_norm": 2.275189466309079, + "language_loss": 0.83446527, + "learning_rate": 3.435260194486232e-06, + "loss": 0.85648102, + "num_input_tokens_seen": 47826245, + "step": 2228, + "time_per_iteration": 4.217801094055176 + }, + { + "auxiliary_loss_clip": 0.0113326, + "auxiliary_loss_mlp": 0.01087604, + "balance_loss_clip": 1.03400099, + "balance_loss_mlp": 1.00601709, + "epoch": 0.2680214032345338, + "flos": 18040659621120.0, + "grad_norm": 2.329301516502918, + "language_loss": 0.81934559, + "learning_rate": 3.4347175907187875e-06, + "loss": 0.84155416, + "num_input_tokens_seen": 47843235, + "step": 2229, + "time_per_iteration": 2.698594570159912 + }, + { + "auxiliary_loss_clip": 0.01143283, + "auxiliary_loss_mlp": 0.01090473, + "balance_loss_clip": 1.03480685, + "balance_loss_mlp": 1.00898218, + "epoch": 0.26814164612517283, + "flos": 22419427086720.0, + "grad_norm": 1.9487183818289069, + "language_loss": 0.87744015, + "learning_rate": 3.4341747693058254e-06, + "loss": 0.89977771, + "num_input_tokens_seen": 47861710, + "step": 2230, + "time_per_iteration": 3.7189571857452393 + }, + { + "auxiliary_loss_clip": 0.01072507, + "auxiliary_loss_mlp": 0.01089163, + "balance_loss_clip": 1.02808094, + "balance_loss_mlp": 1.00757575, + "epoch": 0.26826188901581194, + "flos": 35627371159680.0, + "grad_norm": 1.6510242365139682, + "language_loss": 0.7722553, + "learning_rate": 3.4336317303296916e-06, + "loss": 0.793872, + "num_input_tokens_seen": 47882685, + "step": 2231, + "time_per_iteration": 3.080599308013916 + }, + { + "auxiliary_loss_clip": 0.01140661, + "auxiliary_loss_mlp": 0.01088896, + "balance_loss_clip": 1.03280663, + "balance_loss_mlp": 1.00740492, + "epoch": 0.26838213190645105, + "flos": 17639788861440.0, + "grad_norm": 6.604630873821709, + "language_loss": 0.75082016, + "learning_rate": 3.4330884738727635e-06, + "loss": 0.77311569, + "num_input_tokens_seen": 47900860, + "step": 2232, + "time_per_iteration": 2.6804628372192383 + }, + { + "auxiliary_loss_clip": 0.01112299, + "auxiliary_loss_mlp": 0.01089755, + "balance_loss_clip": 1.02997959, + "balance_loss_mlp": 1.00802481, + "epoch": 0.2685023747970901, + "flos": 22674823764480.0, + "grad_norm": 1.7802537927916158, + "language_loss": 0.70769978, + "learning_rate": 3.4325450000174535e-06, + "loss": 0.72972029, + "num_input_tokens_seen": 47917500, + "step": 2233, + "time_per_iteration": 3.699042797088623 + }, + { + "auxiliary_loss_clip": 0.01109749, + "auxiliary_loss_mlp": 0.01088364, + "balance_loss_clip": 1.02835417, + "balance_loss_mlp": 1.00672925, + "epoch": 0.2686226176877292, + "flos": 20120533764480.0, + "grad_norm": 2.15735185235435, + "language_loss": 0.74198389, + "learning_rate": 3.4320013088462067e-06, + "loss": 0.76396501, + "num_input_tokens_seen": 47934860, + "step": 2234, + "time_per_iteration": 2.816497325897217 + }, + { + "auxiliary_loss_clip": 0.01115802, + "auxiliary_loss_mlp": 0.01087222, + "balance_loss_clip": 1.03401971, + "balance_loss_mlp": 1.00573111, + "epoch": 0.2687428605783683, + "flos": 21872040750720.0, + "grad_norm": 1.4379398452401386, + "language_loss": 0.8162275, + "learning_rate": 3.431457400441499e-06, + "loss": 0.83825779, + "num_input_tokens_seen": 47955255, + "step": 2235, + "time_per_iteration": 2.720768451690674 + }, + { + "auxiliary_loss_clip": 0.01083784, + "auxiliary_loss_mlp": 0.01080881, + "balance_loss_clip": 1.0329951, + "balance_loss_mlp": 1.00115395, + "epoch": 0.2688631034690074, + "flos": 69943320766080.0, + "grad_norm": 0.9083961515353647, + "language_loss": 0.60946327, + "learning_rate": 3.4309132748858424e-06, + "loss": 0.63110995, + "num_input_tokens_seen": 48016245, + "step": 2236, + "time_per_iteration": 3.4751923084259033 + }, + { + "auxiliary_loss_clip": 0.01143968, + "auxiliary_loss_mlp": 0.01088884, + "balance_loss_clip": 1.03613043, + "balance_loss_mlp": 1.00739264, + "epoch": 0.2689833463596465, + "flos": 22856639431680.0, + "grad_norm": 1.562985281651521, + "language_loss": 0.83891499, + "learning_rate": 3.430368932261779e-06, + "loss": 0.86124355, + "num_input_tokens_seen": 48036600, + "step": 2237, + "time_per_iteration": 2.6946351528167725 + }, + { + "auxiliary_loss_clip": 0.01130893, + "auxiliary_loss_mlp": 0.01087985, + "balance_loss_clip": 1.03277993, + "balance_loss_mlp": 1.00639772, + "epoch": 0.2691035892502856, + "flos": 17200242132480.0, + "grad_norm": 1.7666977466670708, + "language_loss": 0.74785829, + "learning_rate": 3.429824372651886e-06, + "loss": 0.77004707, + "num_input_tokens_seen": 48054750, + "step": 2238, + "time_per_iteration": 2.924698829650879 + }, + { + "auxiliary_loss_clip": 0.01115347, + "auxiliary_loss_mlp": 0.01088998, + "balance_loss_clip": 1.0319792, + "balance_loss_mlp": 1.0073638, + "epoch": 0.26922383214092466, + "flos": 17747484814080.0, + "grad_norm": 2.2623443026645456, + "language_loss": 0.83378029, + "learning_rate": 3.4292795961387732e-06, + "loss": 0.85582376, + "num_input_tokens_seen": 48072650, + "step": 2239, + "time_per_iteration": 2.899726629257202 + }, + { + "auxiliary_loss_clip": 0.01152925, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_clip": 1.03553569, + "balance_loss_mlp": 1.00741315, + "epoch": 0.26934407503156377, + "flos": 16173376122240.0, + "grad_norm": 4.408135034042859, + "language_loss": 0.87595618, + "learning_rate": 3.4287346028050818e-06, + "loss": 0.89837497, + "num_input_tokens_seen": 48088720, + "step": 2240, + "time_per_iteration": 2.6592350006103516 + }, + { + "auxiliary_loss_clip": 0.01120728, + "auxiliary_loss_mlp": 0.01086767, + "balance_loss_clip": 1.03347969, + "balance_loss_mlp": 1.00541878, + "epoch": 0.2694643179222028, + "flos": 23732895715200.0, + "grad_norm": 1.5494517414206836, + "language_loss": 0.79813874, + "learning_rate": 3.4281893927334866e-06, + "loss": 0.82021368, + "num_input_tokens_seen": 48108630, + "step": 2241, + "time_per_iteration": 2.9777510166168213 + }, + { + "auxiliary_loss_clip": 0.0114491, + "auxiliary_loss_mlp": 0.01088732, + "balance_loss_clip": 1.03621078, + "balance_loss_mlp": 1.0071454, + "epoch": 0.26958456081284193, + "flos": 24718140840960.0, + "grad_norm": 2.04542705145231, + "language_loss": 0.75329655, + "learning_rate": 3.4276439660066963e-06, + "loss": 0.77563298, + "num_input_tokens_seen": 48128330, + "step": 2242, + "time_per_iteration": 2.7382123470306396 + }, + { + "auxiliary_loss_clip": 0.0115328, + "auxiliary_loss_mlp": 0.01088233, + "balance_loss_clip": 1.03589618, + "balance_loss_mlp": 1.00659871, + "epoch": 0.26970480370348104, + "flos": 18112588606080.0, + "grad_norm": 1.8881761455061785, + "language_loss": 0.84094071, + "learning_rate": 3.427098322707452e-06, + "loss": 0.86335588, + "num_input_tokens_seen": 48144295, + "step": 2243, + "time_per_iteration": 2.8658721446990967 + }, + { + "auxiliary_loss_clip": 0.01139265, + "auxiliary_loss_mlp": 0.01089715, + "balance_loss_clip": 1.03310871, + "balance_loss_mlp": 1.00808001, + "epoch": 0.2698250465941201, + "flos": 10816546250880.0, + "grad_norm": 1.9646374180525152, + "language_loss": 0.89576042, + "learning_rate": 3.426552462918526e-06, + "loss": 0.91805017, + "num_input_tokens_seen": 48162230, + "step": 2244, + "time_per_iteration": 2.6817097663879395 + }, + { + "auxiliary_loss_clip": 0.01154867, + "auxiliary_loss_mlp": 0.01090988, + "balance_loss_clip": 1.03804278, + "balance_loss_mlp": 1.00949621, + "epoch": 0.2699452894847592, + "flos": 17308117653120.0, + "grad_norm": 2.6823637874004933, + "language_loss": 0.72994137, + "learning_rate": 3.426006386722726e-06, + "loss": 0.75239992, + "num_input_tokens_seen": 48180290, + "step": 2245, + "time_per_iteration": 2.77016544342041 + }, + { + "auxiliary_loss_clip": 0.0111766, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_clip": 1.03292239, + "balance_loss_mlp": 1.00937152, + "epoch": 0.2700655323753983, + "flos": 18078150441600.0, + "grad_norm": 1.9589013482468942, + "language_loss": 0.92154837, + "learning_rate": 3.4254600942028914e-06, + "loss": 0.94363356, + "num_input_tokens_seen": 48198165, + "step": 2246, + "time_per_iteration": 2.8639612197875977 + }, + { + "auxiliary_loss_clip": 0.01130634, + "auxiliary_loss_mlp": 0.01088031, + "balance_loss_clip": 1.03263915, + "balance_loss_mlp": 1.00653994, + "epoch": 0.2701857752660374, + "flos": 18186636493440.0, + "grad_norm": 2.1391552013135025, + "language_loss": 0.8286972, + "learning_rate": 3.424913585441893e-06, + "loss": 0.85088384, + "num_input_tokens_seen": 48216000, + "step": 2247, + "time_per_iteration": 2.7767341136932373 + }, + { + "auxiliary_loss_clip": 0.01134402, + "auxiliary_loss_mlp": 0.01089002, + "balance_loss_clip": 1.03273988, + "balance_loss_mlp": 1.00741565, + "epoch": 0.2703060181566765, + "flos": 16319496648960.0, + "grad_norm": 1.8376732667592977, + "language_loss": 0.87440121, + "learning_rate": 3.4243668605226374e-06, + "loss": 0.89663517, + "num_input_tokens_seen": 48233025, + "step": 2248, + "time_per_iteration": 2.7478082180023193 + }, + { + "auxiliary_loss_clip": 0.01112782, + "auxiliary_loss_mlp": 0.01090017, + "balance_loss_clip": 1.02882361, + "balance_loss_mlp": 1.00819206, + "epoch": 0.2704262610473156, + "flos": 19572357329280.0, + "grad_norm": 2.3808821350617673, + "language_loss": 0.82978201, + "learning_rate": 3.423819919528061e-06, + "loss": 0.85181004, + "num_input_tokens_seen": 48251110, + "step": 2249, + "time_per_iteration": 2.791957378387451 + }, + { + "auxiliary_loss_clip": 0.01115398, + "auxiliary_loss_mlp": 0.01088668, + "balance_loss_clip": 1.03207767, + "balance_loss_mlp": 1.00708103, + "epoch": 0.27054650393795465, + "flos": 20740746925440.0, + "grad_norm": 1.600020830519585, + "language_loss": 0.78717661, + "learning_rate": 3.4232727625411355e-06, + "loss": 0.80921721, + "num_input_tokens_seen": 48270215, + "step": 2250, + "time_per_iteration": 3.743931531906128 + }, + { + "auxiliary_loss_clip": 0.01098099, + "auxiliary_loss_mlp": 0.01086552, + "balance_loss_clip": 1.02793312, + "balance_loss_mlp": 1.00501251, + "epoch": 0.27066674682859376, + "flos": 18658322916480.0, + "grad_norm": 1.7282311171783025, + "language_loss": 0.86401725, + "learning_rate": 3.4227253896448626e-06, + "loss": 0.88586372, + "num_input_tokens_seen": 48288075, + "step": 2251, + "time_per_iteration": 2.842029094696045 + }, + { + "auxiliary_loss_clip": 0.01151471, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_clip": 1.03460836, + "balance_loss_mlp": 1.00694525, + "epoch": 0.2707869897192329, + "flos": 23002759958400.0, + "grad_norm": 2.2467970212938253, + "language_loss": 0.82298642, + "learning_rate": 3.42217780092228e-06, + "loss": 0.84538555, + "num_input_tokens_seen": 48306415, + "step": 2252, + "time_per_iteration": 2.7250282764434814 + }, + { + "auxiliary_loss_clip": 0.01117295, + "auxiliary_loss_mlp": 0.01080151, + "balance_loss_clip": 1.03871584, + "balance_loss_mlp": 1.00042403, + "epoch": 0.27090723260987193, + "flos": 58323240293760.0, + "grad_norm": 0.7891146225317397, + "language_loss": 0.60387951, + "learning_rate": 3.421629996456456e-06, + "loss": 0.62585396, + "num_input_tokens_seen": 48365035, + "step": 2253, + "time_per_iteration": 3.3078603744506836 + }, + { + "auxiliary_loss_clip": 0.01145851, + "auxiliary_loss_mlp": 0.01089442, + "balance_loss_clip": 1.03649235, + "balance_loss_mlp": 1.0079025, + "epoch": 0.27102747550051104, + "flos": 11984540797440.0, + "grad_norm": 1.786133323378451, + "language_loss": 0.82173121, + "learning_rate": 3.421081976330491e-06, + "loss": 0.84408414, + "num_input_tokens_seen": 48383550, + "step": 2254, + "time_per_iteration": 3.6869802474975586 + }, + { + "auxiliary_loss_clip": 0.0113388, + "auxiliary_loss_mlp": 0.01089157, + "balance_loss_clip": 1.03358865, + "balance_loss_mlp": 1.00771344, + "epoch": 0.27114771839115015, + "flos": 19900401264000.0, + "grad_norm": 1.7456404905920735, + "language_loss": 0.87695611, + "learning_rate": 3.4205337406275207e-06, + "loss": 0.89918643, + "num_input_tokens_seen": 48403670, + "step": 2255, + "time_per_iteration": 3.675827741622925 + }, + { + "auxiliary_loss_clip": 0.01151302, + "auxiliary_loss_mlp": 0.01085854, + "balance_loss_clip": 1.03424084, + "balance_loss_mlp": 1.00441003, + "epoch": 0.2712679612817892, + "flos": 18331966920960.0, + "grad_norm": 2.198229546703778, + "language_loss": 0.75660002, + "learning_rate": 3.4199852894307114e-06, + "loss": 0.77897155, + "num_input_tokens_seen": 48420420, + "step": 2256, + "time_per_iteration": 2.7329556941986084 + }, + { + "auxiliary_loss_clip": 0.01095786, + "auxiliary_loss_mlp": 0.01089413, + "balance_loss_clip": 1.0246644, + "balance_loss_mlp": 1.0079211, + "epoch": 0.2713882041724283, + "flos": 24460302038400.0, + "grad_norm": 1.822519500378922, + "language_loss": 0.78831404, + "learning_rate": 3.419436622823262e-06, + "loss": 0.810166, + "num_input_tokens_seen": 48441140, + "step": 2257, + "time_per_iteration": 2.852618932723999 + }, + { + "auxiliary_loss_clip": 0.01135556, + "auxiliary_loss_mlp": 0.01086922, + "balance_loss_clip": 1.03485274, + "balance_loss_mlp": 1.00557339, + "epoch": 0.27150844706306737, + "flos": 23039317025280.0, + "grad_norm": 1.800244868901789, + "language_loss": 0.74198872, + "learning_rate": 3.4188877408884063e-06, + "loss": 0.7642135, + "num_input_tokens_seen": 48461845, + "step": 2258, + "time_per_iteration": 2.8395819664001465 + }, + { + "auxiliary_loss_clip": 0.01133824, + "auxiliary_loss_mlp": 0.0108994, + "balance_loss_clip": 1.03370118, + "balance_loss_mlp": 1.00849664, + "epoch": 0.2716286899537065, + "flos": 22563644192640.0, + "grad_norm": 2.345107426902253, + "language_loss": 0.6534431, + "learning_rate": 3.4183386437094088e-06, + "loss": 0.67568082, + "num_input_tokens_seen": 48478510, + "step": 2259, + "time_per_iteration": 3.8042776584625244 + }, + { + "auxiliary_loss_clip": 0.01131783, + "auxiliary_loss_mlp": 0.01087909, + "balance_loss_clip": 1.03265548, + "balance_loss_mlp": 1.006513, + "epoch": 0.2717489328443456, + "flos": 13115044523520.0, + "grad_norm": 2.072861826580927, + "language_loss": 0.82322824, + "learning_rate": 3.417789331369565e-06, + "loss": 0.84542519, + "num_input_tokens_seen": 48494300, + "step": 2260, + "time_per_iteration": 2.7458395957946777 + }, + { + "auxiliary_loss_clip": 0.01153477, + "auxiliary_loss_mlp": 0.01091565, + "balance_loss_clip": 1.03612161, + "balance_loss_mlp": 1.01002586, + "epoch": 0.27186917573498465, + "flos": 29278688060160.0, + "grad_norm": 1.8343661536936404, + "language_loss": 0.90918881, + "learning_rate": 3.4172398039522088e-06, + "loss": 0.93163919, + "num_input_tokens_seen": 48515585, + "step": 2261, + "time_per_iteration": 2.814025640487671 + }, + { + "auxiliary_loss_clip": 0.01145123, + "auxiliary_loss_mlp": 0.01089773, + "balance_loss_clip": 1.0345242, + "balance_loss_mlp": 1.00813842, + "epoch": 0.27198941862562376, + "flos": 26032220000640.0, + "grad_norm": 2.0330044329545545, + "language_loss": 0.79554909, + "learning_rate": 3.4166900615407e-06, + "loss": 0.81789804, + "num_input_tokens_seen": 48533500, + "step": 2262, + "time_per_iteration": 2.7405314445495605 + }, + { + "auxiliary_loss_clip": 0.01139768, + "auxiliary_loss_mlp": 0.01088682, + "balance_loss_clip": 1.03207242, + "balance_loss_mlp": 1.00723791, + "epoch": 0.27210966151626287, + "flos": 32780983760640.0, + "grad_norm": 1.996422184110761, + "language_loss": 0.75028902, + "learning_rate": 3.416140104218436e-06, + "loss": 0.77257353, + "num_input_tokens_seen": 48552865, + "step": 2263, + "time_per_iteration": 2.9083476066589355 + }, + { + "auxiliary_loss_clip": 0.01123652, + "auxiliary_loss_mlp": 0.00873224, + "balance_loss_clip": 1.03787804, + "balance_loss_mlp": 1.0004065, + "epoch": 0.2722299044069019, + "flos": 65471043219840.0, + "grad_norm": 0.8426645940762463, + "language_loss": 0.69745791, + "learning_rate": 3.4155899320688437e-06, + "loss": 0.71742666, + "num_input_tokens_seen": 48618940, + "step": 2264, + "time_per_iteration": 3.3302316665649414 + }, + { + "auxiliary_loss_clip": 0.01098089, + "auxiliary_loss_mlp": 0.01087721, + "balance_loss_clip": 1.0275327, + "balance_loss_mlp": 1.00637305, + "epoch": 0.27235014729754103, + "flos": 15334143782400.0, + "grad_norm": 3.6511769119498974, + "language_loss": 0.74203789, + "learning_rate": 3.415039545175384e-06, + "loss": 0.76389605, + "num_input_tokens_seen": 48634665, + "step": 2265, + "time_per_iteration": 2.8987233638763428 + }, + { + "auxiliary_loss_clip": 0.01141611, + "auxiliary_loss_mlp": 0.01089649, + "balance_loss_clip": 1.03246105, + "balance_loss_mlp": 1.00811028, + "epoch": 0.27247039018818014, + "flos": 21872363973120.0, + "grad_norm": 2.2203443099990574, + "language_loss": 0.64936519, + "learning_rate": 3.414488943621551e-06, + "loss": 0.67167783, + "num_input_tokens_seen": 48653330, + "step": 2266, + "time_per_iteration": 2.789644956588745 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01089767, + "balance_loss_clip": 1.02982104, + "balance_loss_mlp": 1.00827599, + "epoch": 0.2725906330788192, + "flos": 18695490514560.0, + "grad_norm": 1.782885028276444, + "language_loss": 0.74010181, + "learning_rate": 3.41393812749087e-06, + "loss": 0.76235545, + "num_input_tokens_seen": 48671375, + "step": 2267, + "time_per_iteration": 2.6976845264434814 + }, + { + "auxiliary_loss_clip": 0.01128622, + "auxiliary_loss_mlp": 0.01088594, + "balance_loss_clip": 1.03165412, + "balance_loss_mlp": 1.00715041, + "epoch": 0.2727108759694583, + "flos": 17886099398400.0, + "grad_norm": 2.5850266896270684, + "language_loss": 0.71786082, + "learning_rate": 3.4133870968668984e-06, + "loss": 0.74003303, + "num_input_tokens_seen": 48686175, + "step": 2268, + "time_per_iteration": 2.7479169368743896 + }, + { + "auxiliary_loss_clip": 0.01133736, + "auxiliary_loss_mlp": 0.01087745, + "balance_loss_clip": 1.03312504, + "balance_loss_mlp": 1.00625336, + "epoch": 0.2728311188600974, + "flos": 24461666755200.0, + "grad_norm": 2.120079479132532, + "language_loss": 0.78690946, + "learning_rate": 3.412835851833229e-06, + "loss": 0.80912429, + "num_input_tokens_seen": 48708370, + "step": 2269, + "time_per_iteration": 2.811030387878418 + }, + { + "auxiliary_loss_clip": 0.01135819, + "auxiliary_loss_mlp": 0.0109034, + "balance_loss_clip": 1.03044939, + "balance_loss_mlp": 1.00875282, + "epoch": 0.2729513617507365, + "flos": 30993314757120.0, + "grad_norm": 1.693555924177486, + "language_loss": 0.77882636, + "learning_rate": 3.4122843924734834e-06, + "loss": 0.80108798, + "num_input_tokens_seen": 48730670, + "step": 2270, + "time_per_iteration": 2.8762083053588867 + }, + { + "auxiliary_loss_clip": 0.0113351, + "auxiliary_loss_mlp": 0.01087867, + "balance_loss_clip": 1.03382874, + "balance_loss_mlp": 1.00623298, + "epoch": 0.2730716046413756, + "flos": 19094637421440.0, + "grad_norm": 3.211408099556637, + "language_loss": 0.88053, + "learning_rate": 3.411732718871319e-06, + "loss": 0.90274382, + "num_input_tokens_seen": 48746510, + "step": 2271, + "time_per_iteration": 2.7304210662841797 + }, + { + "auxiliary_loss_clip": 0.01153202, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_clip": 1.0367589, + "balance_loss_mlp": 1.00863111, + "epoch": 0.27319184753201464, + "flos": 26944566474240.0, + "grad_norm": 1.4955920045442548, + "language_loss": 0.78507864, + "learning_rate": 3.4111808311104227e-06, + "loss": 0.8075105, + "num_input_tokens_seen": 48768825, + "step": 2272, + "time_per_iteration": 2.719200849533081 + }, + { + "auxiliary_loss_clip": 0.01134658, + "auxiliary_loss_mlp": 0.01087208, + "balance_loss_clip": 1.03328025, + "balance_loss_mlp": 1.00562119, + "epoch": 0.27331209042265375, + "flos": 31759828012800.0, + "grad_norm": 1.6314074170625634, + "language_loss": 0.69338167, + "learning_rate": 3.410628729274517e-06, + "loss": 0.71560037, + "num_input_tokens_seen": 48790345, + "step": 2273, + "time_per_iteration": 2.8443052768707275 + }, + { + "auxiliary_loss_clip": 0.01128001, + "auxiliary_loss_mlp": 0.00873896, + "balance_loss_clip": 1.02984262, + "balance_loss_mlp": 1.00011981, + "epoch": 0.27343233331329286, + "flos": 25739081107200.0, + "grad_norm": 1.910376304761999, + "language_loss": 0.82305276, + "learning_rate": 3.4100764134473546e-06, + "loss": 0.84307182, + "num_input_tokens_seen": 48809630, + "step": 2274, + "time_per_iteration": 2.7920567989349365 + }, + { + "auxiliary_loss_clip": 0.01151744, + "auxiliary_loss_mlp": 0.0108964, + "balance_loss_clip": 1.03572297, + "balance_loss_mlp": 1.00824428, + "epoch": 0.2735525762039319, + "flos": 24389414547840.0, + "grad_norm": 2.426922162674599, + "language_loss": 0.85013825, + "learning_rate": 3.4095238837127215e-06, + "loss": 0.87255216, + "num_input_tokens_seen": 48828770, + "step": 2275, + "time_per_iteration": 2.7730767726898193 + }, + { + "auxiliary_loss_clip": 0.01119896, + "auxiliary_loss_mlp": 0.01089554, + "balance_loss_clip": 1.0296545, + "balance_loss_mlp": 1.00810981, + "epoch": 0.27367281909457103, + "flos": 14465357527680.0, + "grad_norm": 1.8525800690882976, + "language_loss": 0.79122961, + "learning_rate": 3.4089711401544355e-06, + "loss": 0.81332415, + "num_input_tokens_seen": 48846365, + "step": 2276, + "time_per_iteration": 3.7282445430755615 + }, + { + "auxiliary_loss_clip": 0.01142641, + "auxiliary_loss_mlp": 0.01090689, + "balance_loss_clip": 1.03381526, + "balance_loss_mlp": 1.00914979, + "epoch": 0.27379306198521014, + "flos": 23476996247040.0, + "grad_norm": 2.222393194851614, + "language_loss": 0.67722321, + "learning_rate": 3.4084181828563486e-06, + "loss": 0.69955647, + "num_input_tokens_seen": 48863085, + "step": 2277, + "time_per_iteration": 2.7510123252868652 + }, + { + "auxiliary_loss_clip": 0.01111804, + "auxiliary_loss_mlp": 0.01088337, + "balance_loss_clip": 1.02955031, + "balance_loss_mlp": 1.00698841, + "epoch": 0.2739133048758492, + "flos": 17458152762240.0, + "grad_norm": 1.5190816339381525, + "language_loss": 0.70585877, + "learning_rate": 3.4078650119023428e-06, + "loss": 0.72786015, + "num_input_tokens_seen": 48881400, + "step": 2278, + "time_per_iteration": 2.7830846309661865 + }, + { + "auxiliary_loss_clip": 0.01098034, + "auxiliary_loss_mlp": 0.01089191, + "balance_loss_clip": 1.03203106, + "balance_loss_mlp": 1.00760436, + "epoch": 0.2740335477664883, + "flos": 19273113123840.0, + "grad_norm": 2.0538417733284473, + "language_loss": 0.74042541, + "learning_rate": 3.4073116273763337e-06, + "loss": 0.76229769, + "num_input_tokens_seen": 48895845, + "step": 2279, + "time_per_iteration": 3.884397506713867 + }, + { + "auxiliary_loss_clip": 0.01130659, + "auxiliary_loss_mlp": 0.01088894, + "balance_loss_clip": 1.03084242, + "balance_loss_mlp": 1.00721133, + "epoch": 0.2741537906571274, + "flos": 26104723603200.0, + "grad_norm": 1.749043678614107, + "language_loss": 0.81519091, + "learning_rate": 3.40675802936227e-06, + "loss": 0.83738649, + "num_input_tokens_seen": 48916630, + "step": 2280, + "time_per_iteration": 3.679415464401245 + }, + { + "auxiliary_loss_clip": 0.01132441, + "auxiliary_loss_mlp": 0.01089147, + "balance_loss_clip": 1.03260505, + "balance_loss_mlp": 1.00770307, + "epoch": 0.27427403354776647, + "flos": 34164190644480.0, + "grad_norm": 1.8373781728538268, + "language_loss": 0.71703398, + "learning_rate": 3.4062042179441318e-06, + "loss": 0.73924977, + "num_input_tokens_seen": 48937100, + "step": 2281, + "time_per_iteration": 2.905673027038574 + }, + { + "auxiliary_loss_clip": 0.01140445, + "auxiliary_loss_mlp": 0.01089187, + "balance_loss_clip": 1.03352737, + "balance_loss_mlp": 1.00788665, + "epoch": 0.2743942764384056, + "flos": 18766988536320.0, + "grad_norm": 1.9932853302993525, + "language_loss": 0.80569446, + "learning_rate": 3.4056501932059314e-06, + "loss": 0.82799077, + "num_input_tokens_seen": 48955175, + "step": 2282, + "time_per_iteration": 2.6864264011383057 + }, + { + "auxiliary_loss_clip": 0.0114335, + "auxiliary_loss_mlp": 0.01079209, + "balance_loss_clip": 1.04149771, + "balance_loss_mlp": 0.99986374, + "epoch": 0.2745145193290447, + "flos": 64904048058240.0, + "grad_norm": 0.7662871449599695, + "language_loss": 0.58081472, + "learning_rate": 3.405095955231715e-06, + "loss": 0.60304028, + "num_input_tokens_seen": 49006830, + "step": 2283, + "time_per_iteration": 4.107445240020752 + }, + { + "auxiliary_loss_clip": 0.01140996, + "auxiliary_loss_mlp": 0.01087394, + "balance_loss_clip": 1.03207469, + "balance_loss_mlp": 1.00599766, + "epoch": 0.27463476221968375, + "flos": 16136926796160.0, + "grad_norm": 2.259231392312622, + "language_loss": 0.9446888, + "learning_rate": 3.4045415041055585e-06, + "loss": 0.96697271, + "num_input_tokens_seen": 49022470, + "step": 2284, + "time_per_iteration": 2.675840139389038 + }, + { + "auxiliary_loss_clip": 0.01130197, + "auxiliary_loss_mlp": 0.01088108, + "balance_loss_clip": 1.03210104, + "balance_loss_mlp": 1.00656855, + "epoch": 0.27475500511032286, + "flos": 10376712213120.0, + "grad_norm": 2.3376033363913216, + "language_loss": 0.7798084, + "learning_rate": 3.4039868399115728e-06, + "loss": 0.8019914, + "num_input_tokens_seen": 49037110, + "step": 2285, + "time_per_iteration": 2.775907039642334 + }, + { + "auxiliary_loss_clip": 0.01102741, + "auxiliary_loss_mlp": 0.01088382, + "balance_loss_clip": 1.03077769, + "balance_loss_mlp": 1.00689054, + "epoch": 0.27487524800096197, + "flos": 17311062568320.0, + "grad_norm": 1.7333057707232533, + "language_loss": 0.80488026, + "learning_rate": 3.4034319627339003e-06, + "loss": 0.82679141, + "num_input_tokens_seen": 49053975, + "step": 2286, + "time_per_iteration": 2.8927953243255615 + }, + { + "auxiliary_loss_clip": 0.01131731, + "auxiliary_loss_mlp": 0.01087953, + "balance_loss_clip": 1.03292978, + "balance_loss_mlp": 1.00641394, + "epoch": 0.274995490891601, + "flos": 27120205002240.0, + "grad_norm": 2.5247916056710666, + "language_loss": 0.69399047, + "learning_rate": 3.402876872656715e-06, + "loss": 0.71618724, + "num_input_tokens_seen": 49072295, + "step": 2287, + "time_per_iteration": 2.7634341716766357 + }, + { + "auxiliary_loss_clip": 0.01128939, + "auxiliary_loss_mlp": 0.01087923, + "balance_loss_clip": 1.03035355, + "balance_loss_mlp": 1.00662184, + "epoch": 0.27511573378224013, + "flos": 23436093634560.0, + "grad_norm": 1.9880890369545086, + "language_loss": 0.89378619, + "learning_rate": 3.402321569764223e-06, + "loss": 0.91595471, + "num_input_tokens_seen": 49091600, + "step": 2288, + "time_per_iteration": 2.821626663208008 + }, + { + "auxiliary_loss_clip": 0.01109965, + "auxiliary_loss_mlp": 0.00873879, + "balance_loss_clip": 1.02869153, + "balance_loss_mlp": 1.0002501, + "epoch": 0.2752359766728792, + "flos": 16722019434240.0, + "grad_norm": 1.926820728972587, + "language_loss": 0.83517015, + "learning_rate": 3.4017660541406635e-06, + "loss": 0.85500854, + "num_input_tokens_seen": 49107665, + "step": 2289, + "time_per_iteration": 2.8055546283721924 + }, + { + "auxiliary_loss_clip": 0.01123839, + "auxiliary_loss_mlp": 0.01088164, + "balance_loss_clip": 1.03419745, + "balance_loss_mlp": 1.00657725, + "epoch": 0.2753562195635183, + "flos": 25297738698240.0, + "grad_norm": 1.6517167446326677, + "language_loss": 0.74369299, + "learning_rate": 3.4012103258703092e-06, + "loss": 0.76581299, + "num_input_tokens_seen": 49126420, + "step": 2290, + "time_per_iteration": 2.8012659549713135 + }, + { + "auxiliary_loss_clip": 0.01123243, + "auxiliary_loss_mlp": 0.01087808, + "balance_loss_clip": 1.0319252, + "balance_loss_mlp": 1.00636446, + "epoch": 0.2754764624541574, + "flos": 27338972785920.0, + "grad_norm": 1.8878317539837326, + "language_loss": 0.83008742, + "learning_rate": 3.4006543850374616e-06, + "loss": 0.852198, + "num_input_tokens_seen": 49141470, + "step": 2291, + "time_per_iteration": 2.903536319732666 + }, + { + "auxiliary_loss_clip": 0.0114367, + "auxiliary_loss_mlp": 0.01090043, + "balance_loss_clip": 1.03487551, + "balance_loss_mlp": 1.00850427, + "epoch": 0.27559670534479647, + "flos": 17238379397760.0, + "grad_norm": 1.7739258400409716, + "language_loss": 0.74727714, + "learning_rate": 3.400098231726458e-06, + "loss": 0.76961434, + "num_input_tokens_seen": 49158570, + "step": 2292, + "time_per_iteration": 2.715038537979126 + }, + { + "auxiliary_loss_clip": 0.01122784, + "auxiliary_loss_mlp": 0.01088383, + "balance_loss_clip": 1.0310148, + "balance_loss_mlp": 1.00674844, + "epoch": 0.2757169482354356, + "flos": 21939085486080.0, + "grad_norm": 1.8721174632973074, + "language_loss": 0.86633122, + "learning_rate": 3.3995418660216657e-06, + "loss": 0.88844287, + "num_input_tokens_seen": 49176025, + "step": 2293, + "time_per_iteration": 2.7958462238311768 + }, + { + "auxiliary_loss_clip": 0.0115251, + "auxiliary_loss_mlp": 0.01089117, + "balance_loss_clip": 1.03509855, + "balance_loss_mlp": 1.00743496, + "epoch": 0.2758371911260747, + "flos": 20850669521280.0, + "grad_norm": 2.3690471579112753, + "language_loss": 0.80786586, + "learning_rate": 3.3989852880074848e-06, + "loss": 0.83028215, + "num_input_tokens_seen": 49197455, + "step": 2294, + "time_per_iteration": 2.9282426834106445 + }, + { + "auxiliary_loss_clip": 0.01115235, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_clip": 1.0305934, + "balance_loss_mlp": 1.00044751, + "epoch": 0.27595743401671374, + "flos": 69269063592960.0, + "grad_norm": 0.8027253661006465, + "language_loss": 0.60612065, + "learning_rate": 3.398428497768348e-06, + "loss": 0.62807471, + "num_input_tokens_seen": 49262625, + "step": 2295, + "time_per_iteration": 3.4153919219970703 + }, + { + "auxiliary_loss_clip": 0.01124777, + "auxiliary_loss_mlp": 0.01086705, + "balance_loss_clip": 1.03172159, + "balance_loss_mlp": 1.00526118, + "epoch": 0.27607767690735285, + "flos": 21215019127680.0, + "grad_norm": 1.9396231370860326, + "language_loss": 0.72069579, + "learning_rate": 3.3978714953887205e-06, + "loss": 0.74281061, + "num_input_tokens_seen": 49282380, + "step": 2296, + "time_per_iteration": 2.834796667098999 + }, + { + "auxiliary_loss_clip": 0.011121, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_clip": 1.0298543, + "balance_loss_mlp": 1.00654554, + "epoch": 0.27619791979799196, + "flos": 24825334003200.0, + "grad_norm": 1.6224748394156359, + "language_loss": 0.85892648, + "learning_rate": 3.397314280953098e-06, + "loss": 0.88092834, + "num_input_tokens_seen": 49303205, + "step": 2297, + "time_per_iteration": 2.8853535652160645 + }, + { + "auxiliary_loss_clip": 0.01133816, + "auxiliary_loss_mlp": 0.01087474, + "balance_loss_clip": 1.03418016, + "balance_loss_mlp": 1.00612593, + "epoch": 0.276318162688631, + "flos": 24753548672640.0, + "grad_norm": 1.880303804844605, + "language_loss": 0.8019281, + "learning_rate": 3.3967568545460108e-06, + "loss": 0.82414103, + "num_input_tokens_seen": 49322745, + "step": 2298, + "time_per_iteration": 2.7674577236175537 + }, + { + "auxiliary_loss_clip": 0.01138057, + "auxiliary_loss_mlp": 0.01088955, + "balance_loss_clip": 1.03122759, + "balance_loss_mlp": 1.00755858, + "epoch": 0.27643840557927013, + "flos": 18150007599360.0, + "grad_norm": 1.757358072636527, + "language_loss": 0.80723888, + "learning_rate": 3.3961992162520185e-06, + "loss": 0.82950902, + "num_input_tokens_seen": 49341370, + "step": 2299, + "time_per_iteration": 2.6606688499450684 + }, + { + "auxiliary_loss_clip": 0.01140465, + "auxiliary_loss_mlp": 0.01087614, + "balance_loss_clip": 1.03261638, + "balance_loss_mlp": 1.00612211, + "epoch": 0.27655864846990924, + "flos": 24823933372800.0, + "grad_norm": 2.053112835998828, + "language_loss": 0.71659362, + "learning_rate": 3.3956413661557156e-06, + "loss": 0.73887444, + "num_input_tokens_seen": 49361545, + "step": 2300, + "time_per_iteration": 2.6934916973114014 + }, + { + "auxiliary_loss_clip": 0.01123936, + "auxiliary_loss_mlp": 0.01089064, + "balance_loss_clip": 1.03163242, + "balance_loss_mlp": 1.00757241, + "epoch": 0.2766788913605483, + "flos": 20266582464000.0, + "grad_norm": 1.942879719647012, + "language_loss": 0.66436994, + "learning_rate": 3.3950833043417273e-06, + "loss": 0.68649995, + "num_input_tokens_seen": 49379690, + "step": 2301, + "time_per_iteration": 3.5283639430999756 + }, + { + "auxiliary_loss_clip": 0.01143557, + "auxiliary_loss_mlp": 0.01088369, + "balance_loss_clip": 1.0358119, + "balance_loss_mlp": 1.00687766, + "epoch": 0.2767991342511874, + "flos": 21470272151040.0, + "grad_norm": 2.418731983468056, + "language_loss": 0.72953421, + "learning_rate": 3.3945250308947105e-06, + "loss": 0.75185347, + "num_input_tokens_seen": 49395995, + "step": 2302, + "time_per_iteration": 2.6454081535339355 + }, + { + "auxiliary_loss_clip": 0.01132452, + "auxiliary_loss_mlp": 0.01079621, + "balance_loss_clip": 1.03885293, + "balance_loss_mlp": 1.00027514, + "epoch": 0.2769193771418265, + "flos": 66002627571840.0, + "grad_norm": 1.2738982098236387, + "language_loss": 0.68411708, + "learning_rate": 3.3939665458993556e-06, + "loss": 0.70623779, + "num_input_tokens_seen": 49450415, + "step": 2303, + "time_per_iteration": 3.2392711639404297 + }, + { + "auxiliary_loss_clip": 0.01122157, + "auxiliary_loss_mlp": 0.01088092, + "balance_loss_clip": 1.03106606, + "balance_loss_mlp": 1.00664854, + "epoch": 0.27703962003246557, + "flos": 20704441253760.0, + "grad_norm": 1.8517496140896006, + "language_loss": 0.76609325, + "learning_rate": 3.3934078494403843e-06, + "loss": 0.78819573, + "num_input_tokens_seen": 49469990, + "step": 2304, + "time_per_iteration": 2.9145169258117676 + }, + { + "auxiliary_loss_clip": 0.01079046, + "auxiliary_loss_mlp": 0.00873911, + "balance_loss_clip": 1.0273068, + "balance_loss_mlp": 1.00022602, + "epoch": 0.2771598629231047, + "flos": 22929897219840.0, + "grad_norm": 1.5784826464974513, + "language_loss": 0.81081855, + "learning_rate": 3.3928489416025495e-06, + "loss": 0.83034819, + "num_input_tokens_seen": 49490835, + "step": 2305, + "time_per_iteration": 3.975808620452881 + }, + { + "auxiliary_loss_clip": 0.01123199, + "auxiliary_loss_mlp": 0.01087627, + "balance_loss_clip": 1.03070819, + "balance_loss_mlp": 1.00604057, + "epoch": 0.27728010581374374, + "flos": 18369457741440.0, + "grad_norm": 1.945421422698535, + "language_loss": 0.79092419, + "learning_rate": 3.392289822470638e-06, + "loss": 0.81303251, + "num_input_tokens_seen": 49508815, + "step": 2306, + "time_per_iteration": 3.9735054969787598 + }, + { + "auxiliary_loss_clip": 0.01129686, + "auxiliary_loss_mlp": 0.01088138, + "balance_loss_clip": 1.03162158, + "balance_loss_mlp": 1.00669408, + "epoch": 0.27740034870438285, + "flos": 19427637432960.0, + "grad_norm": 1.988483107026499, + "language_loss": 0.75905275, + "learning_rate": 3.3917304921294674e-06, + "loss": 0.78123093, + "num_input_tokens_seen": 49526980, + "step": 2307, + "time_per_iteration": 2.7170236110687256 + }, + { + "auxiliary_loss_clip": 0.01140137, + "auxiliary_loss_mlp": 0.01087079, + "balance_loss_clip": 1.03246951, + "balance_loss_mlp": 1.0055871, + "epoch": 0.27752059159502196, + "flos": 21614776565760.0, + "grad_norm": 1.9596052780960416, + "language_loss": 0.80353725, + "learning_rate": 3.3911709506638876e-06, + "loss": 0.82580948, + "num_input_tokens_seen": 49546290, + "step": 2308, + "time_per_iteration": 2.668239116668701 + }, + { + "auxiliary_loss_clip": 0.01124911, + "auxiliary_loss_mlp": 0.00873913, + "balance_loss_clip": 1.03294349, + "balance_loss_mlp": 1.00018144, + "epoch": 0.277640834485661, + "flos": 26608011016320.0, + "grad_norm": 2.30251189505544, + "language_loss": 0.81409746, + "learning_rate": 3.390611198158781e-06, + "loss": 0.8340857, + "num_input_tokens_seen": 49564165, + "step": 2309, + "time_per_iteration": 3.6739306449890137 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01088277, + "balance_loss_clip": 1.03495049, + "balance_loss_mlp": 1.00683272, + "epoch": 0.2777610773763001, + "flos": 19492814661120.0, + "grad_norm": 2.04271931292426, + "language_loss": 0.90133381, + "learning_rate": 3.3900512346990612e-06, + "loss": 0.92373705, + "num_input_tokens_seen": 49580155, + "step": 2310, + "time_per_iteration": 2.602890968322754 + }, + { + "auxiliary_loss_clip": 0.01114366, + "auxiliary_loss_mlp": 0.01087384, + "balance_loss_clip": 1.0306766, + "balance_loss_mlp": 1.00584435, + "epoch": 0.27788132026693924, + "flos": 38290650001920.0, + "grad_norm": 1.8749282412803907, + "language_loss": 0.65678918, + "learning_rate": 3.389491060369674e-06, + "loss": 0.67880666, + "num_input_tokens_seen": 49605830, + "step": 2311, + "time_per_iteration": 2.9375784397125244 + }, + { + "auxiliary_loss_clip": 0.01112278, + "auxiliary_loss_mlp": 0.01088774, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.00737739, + "epoch": 0.2780015631575783, + "flos": 22382546797440.0, + "grad_norm": 2.769351476037732, + "language_loss": 0.89270002, + "learning_rate": 3.388930675255598e-06, + "loss": 0.91471052, + "num_input_tokens_seen": 49625680, + "step": 2312, + "time_per_iteration": 2.831550359725952 + }, + { + "auxiliary_loss_clip": 0.01131632, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_clip": 1.03232694, + "balance_loss_mlp": 1.0059433, + "epoch": 0.2781218060482174, + "flos": 12203200840320.0, + "grad_norm": 2.3902805182557993, + "language_loss": 0.78871268, + "learning_rate": 3.388370079441843e-06, + "loss": 0.8109048, + "num_input_tokens_seen": 49641195, + "step": 2313, + "time_per_iteration": 2.676632881164551 + }, + { + "auxiliary_loss_clip": 0.01120072, + "auxiliary_loss_mlp": 0.01088868, + "balance_loss_clip": 1.03186214, + "balance_loss_mlp": 1.00742412, + "epoch": 0.2782420489388565, + "flos": 18107632529280.0, + "grad_norm": 1.8400744775153728, + "language_loss": 0.92768973, + "learning_rate": 3.3878092730134505e-06, + "loss": 0.94977915, + "num_input_tokens_seen": 49659180, + "step": 2314, + "time_per_iteration": 2.7431023120880127 + }, + { + "auxiliary_loss_clip": 0.01141928, + "auxiliary_loss_mlp": 0.01089916, + "balance_loss_clip": 1.03322005, + "balance_loss_mlp": 1.00828099, + "epoch": 0.27836229182949557, + "flos": 18514752255360.0, + "grad_norm": 1.497059448471289, + "language_loss": 0.80620003, + "learning_rate": 3.3872482560554947e-06, + "loss": 0.82851851, + "num_input_tokens_seen": 49677955, + "step": 2315, + "time_per_iteration": 2.734943389892578 + }, + { + "auxiliary_loss_clip": 0.01133683, + "auxiliary_loss_mlp": 0.01080284, + "balance_loss_clip": 1.0399425, + "balance_loss_mlp": 1.00055635, + "epoch": 0.2784825347201347, + "flos": 67079230940160.0, + "grad_norm": 0.8022980961045472, + "language_loss": 0.56993359, + "learning_rate": 3.386687028653082e-06, + "loss": 0.59207326, + "num_input_tokens_seen": 49740800, + "step": 2316, + "time_per_iteration": 3.2490274906158447 + }, + { + "auxiliary_loss_clip": 0.01109141, + "auxiliary_loss_mlp": 0.01087286, + "balance_loss_clip": 1.02990031, + "balance_loss_mlp": 1.00584233, + "epoch": 0.2786027776107738, + "flos": 22631119891200.0, + "grad_norm": 1.6603298800854553, + "language_loss": 0.85068798, + "learning_rate": 3.386125590891349e-06, + "loss": 0.87265229, + "num_input_tokens_seen": 49757675, + "step": 2317, + "time_per_iteration": 2.853343963623047 + }, + { + "auxiliary_loss_clip": 0.01133951, + "auxiliary_loss_mlp": 0.01088324, + "balance_loss_clip": 1.03359604, + "balance_loss_mlp": 1.00707138, + "epoch": 0.27872302050141284, + "flos": 15778826156160.0, + "grad_norm": 2.122444045810386, + "language_loss": 0.82856363, + "learning_rate": 3.3855639428554657e-06, + "loss": 0.85078633, + "num_input_tokens_seen": 49775205, + "step": 2318, + "time_per_iteration": 2.706667423248291 + }, + { + "auxiliary_loss_clip": 0.01112187, + "auxiliary_loss_mlp": 0.01086601, + "balance_loss_clip": 1.02901053, + "balance_loss_mlp": 1.00515699, + "epoch": 0.27884326339205195, + "flos": 22126970551680.0, + "grad_norm": 1.9756222661418088, + "language_loss": 0.80476904, + "learning_rate": 3.385002084630635e-06, + "loss": 0.82675683, + "num_input_tokens_seen": 49794175, + "step": 2319, + "time_per_iteration": 2.864187479019165 + }, + { + "auxiliary_loss_clip": 0.01143007, + "auxiliary_loss_mlp": 0.01090273, + "balance_loss_clip": 1.03423977, + "balance_loss_mlp": 1.00873435, + "epoch": 0.278963506282691, + "flos": 20558715776640.0, + "grad_norm": 2.0834513407892445, + "language_loss": 0.84695971, + "learning_rate": 3.384440016302088e-06, + "loss": 0.8692925, + "num_input_tokens_seen": 49812850, + "step": 2320, + "time_per_iteration": 2.711179733276367 + }, + { + "auxiliary_loss_clip": 0.01132151, + "auxiliary_loss_mlp": 0.01086591, + "balance_loss_clip": 1.03095615, + "balance_loss_mlp": 1.00509977, + "epoch": 0.2790837491733301, + "flos": 21942928241280.0, + "grad_norm": 2.2848937087518886, + "language_loss": 0.6236639, + "learning_rate": 3.3838777379550923e-06, + "loss": 0.64585125, + "num_input_tokens_seen": 49832295, + "step": 2321, + "time_per_iteration": 2.694218397140503 + }, + { + "auxiliary_loss_clip": 0.01121101, + "auxiliary_loss_mlp": 0.01088448, + "balance_loss_clip": 1.03337836, + "balance_loss_mlp": 1.00710011, + "epoch": 0.27920399206396923, + "flos": 26286790665600.0, + "grad_norm": 1.9389726002998158, + "language_loss": 0.78259927, + "learning_rate": 3.383315249674944e-06, + "loss": 0.80469477, + "num_input_tokens_seen": 49850860, + "step": 2322, + "time_per_iteration": 2.7885444164276123 + }, + { + "auxiliary_loss_clip": 0.01122072, + "auxiliary_loss_mlp": 0.01087887, + "balance_loss_clip": 1.03247595, + "balance_loss_mlp": 1.00639534, + "epoch": 0.2793242349546083, + "flos": 25400981364480.0, + "grad_norm": 2.0402803104966387, + "language_loss": 0.86252892, + "learning_rate": 3.3827525515469715e-06, + "loss": 0.88462853, + "num_input_tokens_seen": 49865765, + "step": 2323, + "time_per_iteration": 2.8117339611053467 + }, + { + "auxiliary_loss_clip": 0.01120977, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_clip": 1.03164971, + "balance_loss_mlp": 1.0062983, + "epoch": 0.2794444778452474, + "flos": 20850346298880.0, + "grad_norm": 2.1825826636575827, + "language_loss": 0.70798063, + "learning_rate": 3.3821896436565367e-06, + "loss": 0.73006779, + "num_input_tokens_seen": 49885425, + "step": 2324, + "time_per_iteration": 2.8615036010742188 + }, + { + "auxiliary_loss_clip": 0.01142959, + "auxiliary_loss_mlp": 0.01088694, + "balance_loss_clip": 1.03504682, + "balance_loss_mlp": 1.00724995, + "epoch": 0.2795647207358865, + "flos": 21576244250880.0, + "grad_norm": 1.8446297209022677, + "language_loss": 0.70597208, + "learning_rate": 3.381626526089032e-06, + "loss": 0.72828865, + "num_input_tokens_seen": 49904990, + "step": 2325, + "time_per_iteration": 2.7001266479492188 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.01086923, + "balance_loss_clip": 1.03092039, + "balance_loss_mlp": 1.00547957, + "epoch": 0.27968496362652556, + "flos": 21471744608640.0, + "grad_norm": 2.1128281911144087, + "language_loss": 0.79432952, + "learning_rate": 3.3810631989298815e-06, + "loss": 0.81650889, + "num_input_tokens_seen": 49924600, + "step": 2326, + "time_per_iteration": 3.6995582580566406 + }, + { + "auxiliary_loss_clip": 0.01110787, + "auxiliary_loss_mlp": 0.01088759, + "balance_loss_clip": 1.0299921, + "balance_loss_mlp": 1.00726712, + "epoch": 0.2798052065171647, + "flos": 23258695340160.0, + "grad_norm": 2.3883178993921685, + "language_loss": 0.85047567, + "learning_rate": 3.3804996622645423e-06, + "loss": 0.87247109, + "num_input_tokens_seen": 49942600, + "step": 2327, + "time_per_iteration": 2.85036563873291 + }, + { + "auxiliary_loss_clip": 0.01151974, + "auxiliary_loss_mlp": 0.01088475, + "balance_loss_clip": 1.0353837, + "balance_loss_mlp": 1.00688839, + "epoch": 0.2799254494078038, + "flos": 21539328048000.0, + "grad_norm": 1.76819034289687, + "language_loss": 0.89503455, + "learning_rate": 3.3799359161785015e-06, + "loss": 0.9174391, + "num_input_tokens_seen": 49962250, + "step": 2328, + "time_per_iteration": 2.748054027557373 + }, + { + "auxiliary_loss_clip": 0.01138647, + "auxiliary_loss_mlp": 0.01087809, + "balance_loss_clip": 1.03274477, + "balance_loss_mlp": 1.00636542, + "epoch": 0.28004569229844284, + "flos": 26393912000640.0, + "grad_norm": 1.509215802546721, + "language_loss": 0.85691774, + "learning_rate": 3.3793719607572798e-06, + "loss": 0.87918222, + "num_input_tokens_seen": 49983215, + "step": 2329, + "time_per_iteration": 2.8059470653533936 + }, + { + "auxiliary_loss_clip": 0.0113459, + "auxiliary_loss_mlp": 0.01089568, + "balance_loss_clip": 1.03421378, + "balance_loss_mlp": 1.0079813, + "epoch": 0.28016593518908195, + "flos": 33547676584320.0, + "grad_norm": 2.2346943451610866, + "language_loss": 0.77480614, + "learning_rate": 3.378807796086428e-06, + "loss": 0.79704773, + "num_input_tokens_seen": 50006075, + "step": 2330, + "time_per_iteration": 3.882307291030884 + }, + { + "auxiliary_loss_clip": 0.01152144, + "auxiliary_loss_mlp": 0.01087777, + "balance_loss_clip": 1.0359031, + "balance_loss_mlp": 1.00633287, + "epoch": 0.28028617807972106, + "flos": 15340823712000.0, + "grad_norm": 2.168828433294927, + "language_loss": 0.77024001, + "learning_rate": 3.37824342225153e-06, + "loss": 0.79263926, + "num_input_tokens_seen": 50022495, + "step": 2331, + "time_per_iteration": 2.662355422973633 + }, + { + "auxiliary_loss_clip": 0.01111428, + "auxiliary_loss_mlp": 0.01087139, + "balance_loss_clip": 1.03119004, + "balance_loss_mlp": 1.00574279, + "epoch": 0.2804064209703601, + "flos": 25520277409920.0, + "grad_norm": 1.8372185986921594, + "language_loss": 0.77792394, + "learning_rate": 3.3776788393382006e-06, + "loss": 0.79990959, + "num_input_tokens_seen": 50041975, + "step": 2332, + "time_per_iteration": 3.8499629497528076 + }, + { + "auxiliary_loss_clip": 0.01151874, + "auxiliary_loss_mlp": 0.01088731, + "balance_loss_clip": 1.0356096, + "balance_loss_mlp": 1.00719166, + "epoch": 0.2805266638609992, + "flos": 29351766280320.0, + "grad_norm": 2.098130866396286, + "language_loss": 0.75974703, + "learning_rate": 3.3771140474320872e-06, + "loss": 0.78215307, + "num_input_tokens_seen": 50061925, + "step": 2333, + "time_per_iteration": 2.7482964992523193 + }, + { + "auxiliary_loss_clip": 0.01120948, + "auxiliary_loss_mlp": 0.01086552, + "balance_loss_clip": 1.03063321, + "balance_loss_mlp": 1.00520384, + "epoch": 0.28064690675163834, + "flos": 21463735875840.0, + "grad_norm": 2.2201092266467373, + "language_loss": 0.79646289, + "learning_rate": 3.3765490466188664e-06, + "loss": 0.81853789, + "num_input_tokens_seen": 50079325, + "step": 2334, + "time_per_iteration": 3.816721200942993 + }, + { + "auxiliary_loss_clip": 0.01123983, + "auxiliary_loss_mlp": 0.01086305, + "balance_loss_clip": 1.03290176, + "balance_loss_mlp": 1.00495601, + "epoch": 0.2807671496422774, + "flos": 20995640812800.0, + "grad_norm": 2.486689289317471, + "language_loss": 0.74054742, + "learning_rate": 3.3759838369842508e-06, + "loss": 0.76265031, + "num_input_tokens_seen": 50097400, + "step": 2335, + "time_per_iteration": 2.829052448272705 + }, + { + "auxiliary_loss_clip": 0.01116844, + "auxiliary_loss_mlp": 0.01087655, + "balance_loss_clip": 1.02819645, + "balance_loss_mlp": 1.00611544, + "epoch": 0.2808873925329165, + "flos": 21506577822720.0, + "grad_norm": 4.4267222502206085, + "language_loss": 0.73073828, + "learning_rate": 3.375418418613981e-06, + "loss": 0.7527833, + "num_input_tokens_seen": 50116425, + "step": 2336, + "time_per_iteration": 2.758295774459839 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.01087404, + "balance_loss_clip": 1.03134573, + "balance_loss_mlp": 1.00596046, + "epoch": 0.28100763542355556, + "flos": 16070815814400.0, + "grad_norm": 2.678150537333464, + "language_loss": 0.83151728, + "learning_rate": 3.374852791593831e-06, + "loss": 0.85369146, + "num_input_tokens_seen": 50132625, + "step": 2337, + "time_per_iteration": 2.7574594020843506 + }, + { + "auxiliary_loss_clip": 0.0110387, + "auxiliary_loss_mlp": 0.01087167, + "balance_loss_clip": 1.03305602, + "balance_loss_mlp": 1.00581908, + "epoch": 0.28112787831419467, + "flos": 19062605468160.0, + "grad_norm": 2.9417950695331854, + "language_loss": 0.53549945, + "learning_rate": 3.374286956009605e-06, + "loss": 0.55740982, + "num_input_tokens_seen": 50151190, + "step": 2338, + "time_per_iteration": 2.7654049396514893 + }, + { + "auxiliary_loss_clip": 0.0114023, + "auxiliary_loss_mlp": 0.01089528, + "balance_loss_clip": 1.03484106, + "balance_loss_mlp": 1.00817943, + "epoch": 0.2812481212048338, + "flos": 12823629482880.0, + "grad_norm": 2.0009244780340403, + "language_loss": 0.75189447, + "learning_rate": 3.3737209119471405e-06, + "loss": 0.77419204, + "num_input_tokens_seen": 50167700, + "step": 2339, + "time_per_iteration": 2.706228256225586 + }, + { + "auxiliary_loss_clip": 0.0114376, + "auxiliary_loss_mlp": 0.01090112, + "balance_loss_clip": 1.03504634, + "balance_loss_mlp": 1.00838208, + "epoch": 0.28136836409547283, + "flos": 15633064765440.0, + "grad_norm": 2.2146722350045858, + "language_loss": 0.63354421, + "learning_rate": 3.373154659492306e-06, + "loss": 0.65588295, + "num_input_tokens_seen": 50185840, + "step": 2340, + "time_per_iteration": 2.678342342376709 + }, + { + "auxiliary_loss_clip": 0.01122407, + "auxiliary_loss_mlp": 0.01089809, + "balance_loss_clip": 1.03437221, + "balance_loss_mlp": 1.00846004, + "epoch": 0.28148860698611194, + "flos": 19933726106880.0, + "grad_norm": 1.8236887191240752, + "language_loss": 0.84992516, + "learning_rate": 3.3725881987310016e-06, + "loss": 0.87204731, + "num_input_tokens_seen": 50203375, + "step": 2341, + "time_per_iteration": 2.7224831581115723 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_clip": 1.03144586, + "balance_loss_mlp": 1.005288, + "epoch": 0.28160884987675106, + "flos": 17457219008640.0, + "grad_norm": 1.8642950631468749, + "language_loss": 0.87624145, + "learning_rate": 3.372021529749159e-06, + "loss": 0.89840913, + "num_input_tokens_seen": 50222435, + "step": 2342, + "time_per_iteration": 2.752685070037842 + }, + { + "auxiliary_loss_clip": 0.01087809, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_clip": 1.02912819, + "balance_loss_mlp": 1.00736904, + "epoch": 0.2817290927673901, + "flos": 16834743290880.0, + "grad_norm": 2.783127619432597, + "language_loss": 0.92401201, + "learning_rate": 3.3714546526327405e-06, + "loss": 0.94577682, + "num_input_tokens_seen": 50240435, + "step": 2343, + "time_per_iteration": 2.8798909187316895 + }, + { + "auxiliary_loss_clip": 0.01123847, + "auxiliary_loss_mlp": 0.01089324, + "balance_loss_clip": 1.03281307, + "balance_loss_mlp": 1.00764203, + "epoch": 0.2818493356580292, + "flos": 15414081500160.0, + "grad_norm": 2.0831749487425246, + "language_loss": 0.87700343, + "learning_rate": 3.3708875674677423e-06, + "loss": 0.89913511, + "num_input_tokens_seen": 50258410, + "step": 2344, + "time_per_iteration": 2.8756699562072754 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.01086993, + "balance_loss_clip": 1.03527606, + "balance_loss_mlp": 1.00531054, + "epoch": 0.28196957854866833, + "flos": 20412451595520.0, + "grad_norm": 2.285937504411886, + "language_loss": 0.82998574, + "learning_rate": 3.37032027434019e-06, + "loss": 0.85209495, + "num_input_tokens_seen": 50277930, + "step": 2345, + "time_per_iteration": 2.8950178623199463 + }, + { + "auxiliary_loss_clip": 0.0114333, + "auxiliary_loss_mlp": 0.01089421, + "balance_loss_clip": 1.03380609, + "balance_loss_mlp": 1.00754762, + "epoch": 0.2820898214393074, + "flos": 19973120348160.0, + "grad_norm": 1.7662184175649254, + "language_loss": 0.8276819, + "learning_rate": 3.369752773336141e-06, + "loss": 0.85000932, + "num_input_tokens_seen": 50297410, + "step": 2346, + "time_per_iteration": 2.825484037399292 + }, + { + "auxiliary_loss_clip": 0.0113318, + "auxiliary_loss_mlp": 0.01088534, + "balance_loss_clip": 1.0339098, + "balance_loss_mlp": 1.00675678, + "epoch": 0.2822100643299465, + "flos": 22528308188160.0, + "grad_norm": 1.6882846271165513, + "language_loss": 0.78420812, + "learning_rate": 3.3691850645416864e-06, + "loss": 0.80642527, + "num_input_tokens_seen": 50317120, + "step": 2347, + "time_per_iteration": 2.7488603591918945 + }, + { + "auxiliary_loss_clip": 0.01141799, + "auxiliary_loss_mlp": 0.01088891, + "balance_loss_clip": 1.03305197, + "balance_loss_mlp": 1.0074476, + "epoch": 0.2823303072205856, + "flos": 11546682007680.0, + "grad_norm": 2.0112197648453924, + "language_loss": 0.83058381, + "learning_rate": 3.368617148042945e-06, + "loss": 0.85289073, + "num_input_tokens_seen": 50334790, + "step": 2348, + "time_per_iteration": 2.705869197845459 + }, + { + "auxiliary_loss_clip": 0.0113473, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_clip": 1.03361917, + "balance_loss_mlp": 1.00889945, + "epoch": 0.28245055011122466, + "flos": 18259894281600.0, + "grad_norm": 2.955571042035983, + "language_loss": 0.84734714, + "learning_rate": 3.368049023926071e-06, + "loss": 0.86959976, + "num_input_tokens_seen": 50353785, + "step": 2349, + "time_per_iteration": 2.725085496902466 + }, + { + "auxiliary_loss_clip": 0.01142832, + "auxiliary_loss_mlp": 0.01088681, + "balance_loss_clip": 1.03507185, + "balance_loss_mlp": 1.00728512, + "epoch": 0.2825707930018638, + "flos": 24608110504320.0, + "grad_norm": 1.6349691007663614, + "language_loss": 0.83566511, + "learning_rate": 3.3674806922772476e-06, + "loss": 0.85798025, + "num_input_tokens_seen": 50374670, + "step": 2350, + "time_per_iteration": 2.7328898906707764 + }, + { + "auxiliary_loss_clip": 0.01112673, + "auxiliary_loss_mlp": 0.01089366, + "balance_loss_clip": 1.03307652, + "balance_loss_mlp": 1.00797009, + "epoch": 0.28269103589250283, + "flos": 25226994862080.0, + "grad_norm": 1.6850261254960968, + "language_loss": 0.74762172, + "learning_rate": 3.3669121531826904e-06, + "loss": 0.76964211, + "num_input_tokens_seen": 50395650, + "step": 2351, + "time_per_iteration": 2.8798446655273438 + }, + { + "auxiliary_loss_clip": 0.01117164, + "auxiliary_loss_mlp": 0.01087996, + "balance_loss_clip": 1.02941799, + "balance_loss_mlp": 1.00655198, + "epoch": 0.28281127878314194, + "flos": 19281552819840.0, + "grad_norm": 2.0563748854461865, + "language_loss": 0.8320682, + "learning_rate": 3.366343406728647e-06, + "loss": 0.85411978, + "num_input_tokens_seen": 50415100, + "step": 2352, + "time_per_iteration": 3.643009662628174 + }, + { + "auxiliary_loss_clip": 0.01141381, + "auxiliary_loss_mlp": 0.01086536, + "balance_loss_clip": 1.03236914, + "balance_loss_mlp": 1.00504434, + "epoch": 0.28293152167378105, + "flos": 23878405710720.0, + "grad_norm": 1.8221438431474348, + "language_loss": 0.68328714, + "learning_rate": 3.3657744530013946e-06, + "loss": 0.70556629, + "num_input_tokens_seen": 50434335, + "step": 2353, + "time_per_iteration": 2.6969101428985596 + }, + { + "auxiliary_loss_clip": 0.01142972, + "auxiliary_loss_mlp": 0.01088742, + "balance_loss_clip": 1.03446794, + "balance_loss_mlp": 1.00725055, + "epoch": 0.2830517645644201, + "flos": 43866965928960.0, + "grad_norm": 3.128546197328839, + "language_loss": 0.70820248, + "learning_rate": 3.3652052920872437e-06, + "loss": 0.73051953, + "num_input_tokens_seen": 50457200, + "step": 2354, + "time_per_iteration": 2.82463002204895 + }, + { + "auxiliary_loss_clip": 0.01129403, + "auxiliary_loss_mlp": 0.01087367, + "balance_loss_clip": 1.03032398, + "balance_loss_mlp": 1.00573218, + "epoch": 0.2831720074550592, + "flos": 26651750803200.0, + "grad_norm": 2.015661219640529, + "language_loss": 0.85650259, + "learning_rate": 3.3646359240725355e-06, + "loss": 0.87867028, + "num_input_tokens_seen": 50476390, + "step": 2355, + "time_per_iteration": 3.661747932434082 + }, + { + "auxiliary_loss_clip": 0.01142493, + "auxiliary_loss_mlp": 0.00873831, + "balance_loss_clip": 1.03406549, + "balance_loss_mlp": 1.0001725, + "epoch": 0.2832922503456983, + "flos": 31029979564800.0, + "grad_norm": 1.83541529858414, + "language_loss": 0.67985141, + "learning_rate": 3.364066349043643e-06, + "loss": 0.70001465, + "num_input_tokens_seen": 50497595, + "step": 2356, + "time_per_iteration": 2.8620715141296387 + }, + { + "auxiliary_loss_clip": 0.01130887, + "auxiliary_loss_mlp": 0.01088087, + "balance_loss_clip": 1.03215897, + "balance_loss_mlp": 1.00673842, + "epoch": 0.2834124932363374, + "flos": 20405699838720.0, + "grad_norm": 1.7005036520231684, + "language_loss": 0.81969273, + "learning_rate": 3.363496567086969e-06, + "loss": 0.84188247, + "num_input_tokens_seen": 50514690, + "step": 2357, + "time_per_iteration": 3.764211654663086 + }, + { + "auxiliary_loss_clip": 0.01153232, + "auxiliary_loss_mlp": 0.01090676, + "balance_loss_clip": 1.03658569, + "balance_loss_mlp": 1.0092324, + "epoch": 0.2835327361269765, + "flos": 39384848056320.0, + "grad_norm": 1.9244342589452896, + "language_loss": 0.75319284, + "learning_rate": 3.3629265782889506e-06, + "loss": 0.7756319, + "num_input_tokens_seen": 50536515, + "step": 2358, + "time_per_iteration": 3.723461866378784 + }, + { + "auxiliary_loss_clip": 0.01122875, + "auxiliary_loss_mlp": 0.01087205, + "balance_loss_clip": 1.03119671, + "balance_loss_mlp": 1.00576091, + "epoch": 0.2836529790176156, + "flos": 30261598801920.0, + "grad_norm": 1.6928216877049274, + "language_loss": 0.71913052, + "learning_rate": 3.362356382736054e-06, + "loss": 0.74123132, + "num_input_tokens_seen": 50557120, + "step": 2359, + "time_per_iteration": 2.91104793548584 + }, + { + "auxiliary_loss_clip": 0.01110256, + "auxiliary_loss_mlp": 0.01087723, + "balance_loss_clip": 1.0304513, + "balance_loss_mlp": 1.00642264, + "epoch": 0.28377322190825466, + "flos": 12677796264960.0, + "grad_norm": 2.0841026010738735, + "language_loss": 0.91040516, + "learning_rate": 3.361785980514777e-06, + "loss": 0.93238497, + "num_input_tokens_seen": 50573320, + "step": 2360, + "time_per_iteration": 2.740525484085083 + }, + { + "auxiliary_loss_clip": 0.010941, + "auxiliary_loss_mlp": 0.01087972, + "balance_loss_clip": 1.02511811, + "balance_loss_mlp": 1.00648069, + "epoch": 0.28389346479889377, + "flos": 18296666830080.0, + "grad_norm": 3.340087233456076, + "language_loss": 0.76344788, + "learning_rate": 3.361215371711649e-06, + "loss": 0.78526866, + "num_input_tokens_seen": 50592415, + "step": 2361, + "time_per_iteration": 2.840730667114258 + }, + { + "auxiliary_loss_clip": 0.01118228, + "auxiliary_loss_mlp": 0.01088634, + "balance_loss_clip": 1.03028035, + "balance_loss_mlp": 1.00728512, + "epoch": 0.2840137076895329, + "flos": 20406992728320.0, + "grad_norm": 1.6652008899122224, + "language_loss": 0.83323282, + "learning_rate": 3.3606445564132326e-06, + "loss": 0.8553015, + "num_input_tokens_seen": 50609710, + "step": 2362, + "time_per_iteration": 2.7945234775543213 + }, + { + "auxiliary_loss_clip": 0.01152181, + "auxiliary_loss_mlp": 0.00873753, + "balance_loss_clip": 1.03553605, + "balance_loss_mlp": 1.00019169, + "epoch": 0.28413395058017193, + "flos": 20048030161920.0, + "grad_norm": 2.140935141325954, + "language_loss": 0.8195467, + "learning_rate": 3.360073534706118e-06, + "loss": 0.83980608, + "num_input_tokens_seen": 50626865, + "step": 2363, + "time_per_iteration": 2.6512234210968018 + }, + { + "auxiliary_loss_clip": 0.01129996, + "auxiliary_loss_mlp": 0.01088792, + "balance_loss_clip": 1.03130436, + "balance_loss_mlp": 1.00725293, + "epoch": 0.28425419347081105, + "flos": 37663613256960.0, + "grad_norm": 1.846767769777107, + "language_loss": 0.75859094, + "learning_rate": 3.35950230667693e-06, + "loss": 0.78077877, + "num_input_tokens_seen": 50648560, + "step": 2364, + "time_per_iteration": 2.8657209873199463 + }, + { + "auxiliary_loss_clip": 0.01141324, + "auxiliary_loss_mlp": 0.01087894, + "balance_loss_clip": 1.03301096, + "balance_loss_mlp": 1.00640225, + "epoch": 0.28437443636145016, + "flos": 13845072539520.0, + "grad_norm": 2.0771561332354613, + "language_loss": 0.86199856, + "learning_rate": 3.358930872412323e-06, + "loss": 0.8842907, + "num_input_tokens_seen": 50665725, + "step": 2365, + "time_per_iteration": 2.634852170944214 + }, + { + "auxiliary_loss_clip": 0.01139648, + "auxiliary_loss_mlp": 0.01088077, + "balance_loss_clip": 1.03295255, + "balance_loss_mlp": 1.00668108, + "epoch": 0.2844946792520892, + "flos": 22747794243840.0, + "grad_norm": 5.126282018374334, + "language_loss": 0.81046188, + "learning_rate": 3.3583592319989825e-06, + "loss": 0.83273911, + "num_input_tokens_seen": 50685095, + "step": 2366, + "time_per_iteration": 2.7355594635009766 + }, + { + "auxiliary_loss_clip": 0.01144589, + "auxiliary_loss_mlp": 0.01088659, + "balance_loss_clip": 1.03552365, + "balance_loss_mlp": 1.00697708, + "epoch": 0.2846149221427283, + "flos": 32415987709440.0, + "grad_norm": 2.3537841179070176, + "language_loss": 0.68425322, + "learning_rate": 3.357787385523627e-06, + "loss": 0.70658565, + "num_input_tokens_seen": 50706500, + "step": 2367, + "time_per_iteration": 2.7375271320343018 + }, + { + "auxiliary_loss_clip": 0.01102339, + "auxiliary_loss_mlp": 0.01087451, + "balance_loss_clip": 1.02804124, + "balance_loss_mlp": 1.00595999, + "epoch": 0.2847351650333674, + "flos": 28475976873600.0, + "grad_norm": 1.7870179237560353, + "language_loss": 0.82562768, + "learning_rate": 3.3572153330730048e-06, + "loss": 0.8475256, + "num_input_tokens_seen": 50727595, + "step": 2368, + "time_per_iteration": 2.912404775619507 + }, + { + "auxiliary_loss_clip": 0.01118122, + "auxiliary_loss_mlp": 0.01084956, + "balance_loss_clip": 1.04082608, + "balance_loss_mlp": 1.0052284, + "epoch": 0.2848554079240065, + "flos": 55753399704960.0, + "grad_norm": 0.8333932745354102, + "language_loss": 0.64725584, + "learning_rate": 3.3566430747338956e-06, + "loss": 0.66928661, + "num_input_tokens_seen": 50782800, + "step": 2369, + "time_per_iteration": 3.160163164138794 + }, + { + "auxiliary_loss_clip": 0.01140215, + "auxiliary_loss_mlp": 0.01087685, + "balance_loss_clip": 1.03156948, + "balance_loss_mlp": 1.00628889, + "epoch": 0.2849756508146456, + "flos": 11836875985920.0, + "grad_norm": 1.9968437579148794, + "language_loss": 0.86602628, + "learning_rate": 3.35607061059311e-06, + "loss": 0.88830531, + "num_input_tokens_seen": 50797730, + "step": 2370, + "time_per_iteration": 2.6832962036132812 + }, + { + "auxiliary_loss_clip": 0.01152769, + "auxiliary_loss_mlp": 0.01089279, + "balance_loss_clip": 1.03736281, + "balance_loss_mlp": 1.00793028, + "epoch": 0.28509589370528465, + "flos": 25155209531520.0, + "grad_norm": 1.7048632976932676, + "language_loss": 0.74847049, + "learning_rate": 3.3554979407374917e-06, + "loss": 0.77089095, + "num_input_tokens_seen": 50819840, + "step": 2371, + "time_per_iteration": 2.7477502822875977 + }, + { + "auxiliary_loss_clip": 0.01141115, + "auxiliary_loss_mlp": 0.01089383, + "balance_loss_clip": 1.03303599, + "balance_loss_mlp": 1.00808263, + "epoch": 0.28521613659592376, + "flos": 19974808287360.0, + "grad_norm": 3.2398107674809364, + "language_loss": 0.7368542, + "learning_rate": 3.3549250652539134e-06, + "loss": 0.75915909, + "num_input_tokens_seen": 50838935, + "step": 2372, + "time_per_iteration": 2.6816511154174805 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01088733, + "balance_loss_clip": 1.03229737, + "balance_loss_mlp": 1.00719357, + "epoch": 0.2853363794865629, + "flos": 23367971491200.0, + "grad_norm": 1.7221033246755102, + "language_loss": 0.81370759, + "learning_rate": 3.3543519842292794e-06, + "loss": 0.83592176, + "num_input_tokens_seen": 50858590, + "step": 2373, + "time_per_iteration": 2.8647403717041016 + }, + { + "auxiliary_loss_clip": 0.0115299, + "auxiliary_loss_mlp": 0.0087377, + "balance_loss_clip": 1.03628349, + "balance_loss_mlp": 1.00015044, + "epoch": 0.28545662237720193, + "flos": 19861940776320.0, + "grad_norm": 1.6952787081196206, + "language_loss": 0.83654052, + "learning_rate": 3.353778697750527e-06, + "loss": 0.85680807, + "num_input_tokens_seen": 50876995, + "step": 2374, + "time_per_iteration": 2.6133482456207275 + }, + { + "auxiliary_loss_clip": 0.01132481, + "auxiliary_loss_mlp": 0.01087246, + "balance_loss_clip": 1.033427, + "balance_loss_mlp": 1.00565898, + "epoch": 0.28557686526784104, + "flos": 23879016241920.0, + "grad_norm": 1.6334953283845777, + "language_loss": 0.89417922, + "learning_rate": 3.353205205904622e-06, + "loss": 0.91637641, + "num_input_tokens_seen": 50896105, + "step": 2375, + "time_per_iteration": 2.732997417449951 + }, + { + "auxiliary_loss_clip": 0.0112994, + "auxiliary_loss_mlp": 0.01086855, + "balance_loss_clip": 1.03181016, + "balance_loss_mlp": 1.00536311, + "epoch": 0.28569710815848015, + "flos": 44890384233600.0, + "grad_norm": 1.7559232386476333, + "language_loss": 0.72182143, + "learning_rate": 3.3526315087785637e-06, + "loss": 0.74398941, + "num_input_tokens_seen": 50917220, + "step": 2376, + "time_per_iteration": 2.8990025520324707 + }, + { + "auxiliary_loss_clip": 0.01104405, + "auxiliary_loss_mlp": 0.01087835, + "balance_loss_clip": 1.03002727, + "balance_loss_mlp": 1.00643933, + "epoch": 0.2858173510491192, + "flos": 26829759628800.0, + "grad_norm": 1.5747619477988914, + "language_loss": 0.81299454, + "learning_rate": 3.3520576064593805e-06, + "loss": 0.83491695, + "num_input_tokens_seen": 50937175, + "step": 2377, + "time_per_iteration": 3.765629529953003 + }, + { + "auxiliary_loss_clip": 0.01143873, + "auxiliary_loss_mlp": 0.010891, + "balance_loss_clip": 1.03530407, + "balance_loss_mlp": 1.00756109, + "epoch": 0.2859375939397583, + "flos": 23148916398720.0, + "grad_norm": 1.3921739809813911, + "language_loss": 0.81747818, + "learning_rate": 3.3514834990341337e-06, + "loss": 0.83980787, + "num_input_tokens_seen": 50957500, + "step": 2378, + "time_per_iteration": 2.7308225631713867 + }, + { + "auxiliary_loss_clip": 0.0113379, + "auxiliary_loss_mlp": 0.01087746, + "balance_loss_clip": 1.03373361, + "balance_loss_mlp": 1.00639737, + "epoch": 0.2860578368303974, + "flos": 12129799397760.0, + "grad_norm": 4.303686574044318, + "language_loss": 0.93061215, + "learning_rate": 3.3509091865899144e-06, + "loss": 0.95282751, + "num_input_tokens_seen": 50972690, + "step": 2379, + "time_per_iteration": 2.7192561626434326 + }, + { + "auxiliary_loss_clip": 0.01151004, + "auxiliary_loss_mlp": 0.01088566, + "balance_loss_clip": 1.03428864, + "balance_loss_mlp": 1.0069313, + "epoch": 0.2861780797210365, + "flos": 19938035738880.0, + "grad_norm": 2.0875841031647395, + "language_loss": 0.7070992, + "learning_rate": 3.350334669213846e-06, + "loss": 0.72949493, + "num_input_tokens_seen": 50990095, + "step": 2380, + "time_per_iteration": 3.5825231075286865 + }, + { + "auxiliary_loss_clip": 0.01141005, + "auxiliary_loss_mlp": 0.01088326, + "balance_loss_clip": 1.03457499, + "balance_loss_mlp": 1.00693011, + "epoch": 0.2862983226116756, + "flos": 27563127609600.0, + "grad_norm": 2.1764876068012367, + "language_loss": 0.75614548, + "learning_rate": 3.3497599469930816e-06, + "loss": 0.77843881, + "num_input_tokens_seen": 51008305, + "step": 2381, + "time_per_iteration": 2.707429885864258 + }, + { + "auxiliary_loss_clip": 0.01150624, + "auxiliary_loss_mlp": 0.01088231, + "balance_loss_clip": 1.03403163, + "balance_loss_mlp": 1.00664461, + "epoch": 0.28641856550231465, + "flos": 22053964158720.0, + "grad_norm": 2.4822859664927526, + "language_loss": 0.83091402, + "learning_rate": 3.349185020014807e-06, + "loss": 0.8533026, + "num_input_tokens_seen": 51025570, + "step": 2382, + "time_per_iteration": 3.6198172569274902 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01088682, + "balance_loss_clip": 1.0329442, + "balance_loss_mlp": 1.0070951, + "epoch": 0.28653880839295376, + "flos": 22378775869440.0, + "grad_norm": 1.7208850251466958, + "language_loss": 0.74513924, + "learning_rate": 3.348609888366237e-06, + "loss": 0.76743245, + "num_input_tokens_seen": 51044585, + "step": 2383, + "time_per_iteration": 3.5841543674468994 + }, + { + "auxiliary_loss_clip": 0.01096893, + "auxiliary_loss_mlp": 0.01087737, + "balance_loss_clip": 1.02557921, + "balance_loss_mlp": 1.00638914, + "epoch": 0.28665905128359287, + "flos": 23367971491200.0, + "grad_norm": 1.866529908350655, + "language_loss": 0.6293028, + "learning_rate": 3.348034552134619e-06, + "loss": 0.65114909, + "num_input_tokens_seen": 51063990, + "step": 2384, + "time_per_iteration": 2.823188304901123 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01087364, + "balance_loss_clip": 1.02646542, + "balance_loss_mlp": 1.00592041, + "epoch": 0.2867792941742319, + "flos": 20881695893760.0, + "grad_norm": 1.8832444611606238, + "language_loss": 0.84130168, + "learning_rate": 3.3474590114072316e-06, + "loss": 0.86322212, + "num_input_tokens_seen": 51081990, + "step": 2385, + "time_per_iteration": 2.784257650375366 + }, + { + "auxiliary_loss_clip": 0.01115955, + "auxiliary_loss_mlp": 0.01088684, + "balance_loss_clip": 1.02889788, + "balance_loss_mlp": 1.00704908, + "epoch": 0.28689953706487104, + "flos": 20664005518080.0, + "grad_norm": 1.699358267223278, + "language_loss": 0.82742727, + "learning_rate": 3.3468832662713836e-06, + "loss": 0.8494736, + "num_input_tokens_seen": 51100235, + "step": 2386, + "time_per_iteration": 2.767282247543335 + }, + { + "auxiliary_loss_clip": 0.01120863, + "auxiliary_loss_mlp": 0.01089967, + "balance_loss_clip": 1.03173447, + "balance_loss_mlp": 1.00847554, + "epoch": 0.28701977995551015, + "flos": 12675533708160.0, + "grad_norm": 2.2593514265213837, + "language_loss": 0.83597142, + "learning_rate": 3.346307316814415e-06, + "loss": 0.85807979, + "num_input_tokens_seen": 51115405, + "step": 2387, + "time_per_iteration": 2.7201809883117676 + }, + { + "auxiliary_loss_clip": 0.0113359, + "auxiliary_loss_mlp": 0.0109035, + "balance_loss_clip": 1.03262675, + "balance_loss_mlp": 1.00881052, + "epoch": 0.2871400228461492, + "flos": 21252366293760.0, + "grad_norm": 1.8522112965462472, + "language_loss": 0.75964284, + "learning_rate": 3.3457311631236965e-06, + "loss": 0.78188229, + "num_input_tokens_seen": 51136390, + "step": 2388, + "time_per_iteration": 2.8525869846343994 + }, + { + "auxiliary_loss_clip": 0.0113156, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_clip": 1.03287935, + "balance_loss_mlp": 1.00791693, + "epoch": 0.2872602657367883, + "flos": 25119262995840.0, + "grad_norm": 1.799506571934014, + "language_loss": 0.8463499, + "learning_rate": 3.345154805286631e-06, + "loss": 0.86856151, + "num_input_tokens_seen": 51156650, + "step": 2389, + "time_per_iteration": 2.7475497722625732 + }, + { + "auxiliary_loss_clip": 0.01142518, + "auxiliary_loss_mlp": 0.01089255, + "balance_loss_clip": 1.03365064, + "balance_loss_mlp": 1.00766814, + "epoch": 0.2873805086274274, + "flos": 16646606830080.0, + "grad_norm": 2.5664168492342356, + "language_loss": 0.76063168, + "learning_rate": 3.344578243390651e-06, + "loss": 0.78294933, + "num_input_tokens_seen": 51172210, + "step": 2390, + "time_per_iteration": 2.671415328979492 + }, + { + "auxiliary_loss_clip": 0.01128171, + "auxiliary_loss_mlp": 0.01087158, + "balance_loss_clip": 1.03165054, + "balance_loss_mlp": 1.00566673, + "epoch": 0.2875007515180665, + "flos": 17420123237760.0, + "grad_norm": 2.0723728446302268, + "language_loss": 0.79009759, + "learning_rate": 3.3440014775232206e-06, + "loss": 0.81225085, + "num_input_tokens_seen": 51190265, + "step": 2391, + "time_per_iteration": 2.693671464920044 + }, + { + "auxiliary_loss_clip": 0.01110436, + "auxiliary_loss_mlp": 0.01088428, + "balance_loss_clip": 1.03172445, + "balance_loss_mlp": 1.00712752, + "epoch": 0.2876209944087056, + "flos": 23434190213760.0, + "grad_norm": 1.9032974734546828, + "language_loss": 0.70737845, + "learning_rate": 3.343424507771834e-06, + "loss": 0.72936708, + "num_input_tokens_seen": 51208475, + "step": 2392, + "time_per_iteration": 2.7405433654785156 + }, + { + "auxiliary_loss_clip": 0.0111713, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_clip": 1.02848363, + "balance_loss_mlp": 1.00545311, + "epoch": 0.2877412372993447, + "flos": 13735509079680.0, + "grad_norm": 1.738100900778156, + "language_loss": 0.86321986, + "learning_rate": 3.342847334224018e-06, + "loss": 0.88526016, + "num_input_tokens_seen": 51225875, + "step": 2393, + "time_per_iteration": 2.728382110595703 + }, + { + "auxiliary_loss_clip": 0.01131704, + "auxiliary_loss_mlp": 0.01079891, + "balance_loss_clip": 1.03889441, + "balance_loss_mlp": 1.00016391, + "epoch": 0.28786148018998375, + "flos": 58079695104000.0, + "grad_norm": 0.9450798012440418, + "language_loss": 0.62430465, + "learning_rate": 3.342269956967329e-06, + "loss": 0.6464206, + "num_input_tokens_seen": 51287780, + "step": 2394, + "time_per_iteration": 3.3318605422973633 + }, + { + "auxiliary_loss_clip": 0.01139839, + "auxiliary_loss_mlp": 0.01089226, + "balance_loss_clip": 1.0323559, + "balance_loss_mlp": 1.00744891, + "epoch": 0.28798172308062286, + "flos": 23435052140160.0, + "grad_norm": 2.658707627323663, + "language_loss": 0.71321106, + "learning_rate": 3.341692376089355e-06, + "loss": 0.73550177, + "num_input_tokens_seen": 51303335, + "step": 2395, + "time_per_iteration": 2.6916520595550537 + }, + { + "auxiliary_loss_clip": 0.01139209, + "auxiliary_loss_mlp": 0.01087237, + "balance_loss_clip": 1.03263521, + "balance_loss_mlp": 1.00588822, + "epoch": 0.288101965971262, + "flos": 25110033200640.0, + "grad_norm": 3.374042976378868, + "language_loss": 0.84541845, + "learning_rate": 3.3411145916777146e-06, + "loss": 0.86768293, + "num_input_tokens_seen": 51317495, + "step": 2396, + "time_per_iteration": 2.702869176864624 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01087964, + "balance_loss_clip": 1.03428793, + "balance_loss_mlp": 1.00637674, + "epoch": 0.28822220886190103, + "flos": 16252559654400.0, + "grad_norm": 2.1131595846403655, + "language_loss": 0.91272116, + "learning_rate": 3.3405366038200566e-06, + "loss": 0.93494308, + "num_input_tokens_seen": 51336430, + "step": 2397, + "time_per_iteration": 2.723989248275757 + }, + { + "auxiliary_loss_clip": 0.01129996, + "auxiliary_loss_mlp": 0.01090006, + "balance_loss_clip": 1.03285205, + "balance_loss_mlp": 1.00827646, + "epoch": 0.28834245175254014, + "flos": 24535642815360.0, + "grad_norm": 2.8451172360082593, + "language_loss": 0.85491318, + "learning_rate": 3.3399584126040617e-06, + "loss": 0.87711322, + "num_input_tokens_seen": 51355930, + "step": 2398, + "time_per_iteration": 2.8020477294921875 + }, + { + "auxiliary_loss_clip": 0.01150265, + "auxiliary_loss_mlp": 0.00873873, + "balance_loss_clip": 1.03377986, + "balance_loss_mlp": 1.00026155, + "epoch": 0.2884626946431792, + "flos": 24571445696640.0, + "grad_norm": 2.096107921719224, + "language_loss": 0.90981591, + "learning_rate": 3.339380018117441e-06, + "loss": 0.93005729, + "num_input_tokens_seen": 51376765, + "step": 2399, + "time_per_iteration": 2.7240853309631348 + }, + { + "auxiliary_loss_clip": 0.01140786, + "auxiliary_loss_mlp": 0.01087393, + "balance_loss_clip": 1.03418398, + "balance_loss_mlp": 1.0059973, + "epoch": 0.2885829375338183, + "flos": 16544657053440.0, + "grad_norm": 2.7843782533038848, + "language_loss": 0.78785157, + "learning_rate": 3.3388014204479366e-06, + "loss": 0.81013334, + "num_input_tokens_seen": 51394570, + "step": 2400, + "time_per_iteration": 2.670320510864258 + }, + { + "auxiliary_loss_clip": 0.01150062, + "auxiliary_loss_mlp": 0.01087517, + "balance_loss_clip": 1.03358769, + "balance_loss_mlp": 1.0057869, + "epoch": 0.2887031804244574, + "flos": 24061226958720.0, + "grad_norm": 1.9447295400088394, + "language_loss": 0.9175601, + "learning_rate": 3.338222619683321e-06, + "loss": 0.93993592, + "num_input_tokens_seen": 51414535, + "step": 2401, + "time_per_iteration": 2.687778949737549 + }, + { + "auxiliary_loss_clip": 0.01130299, + "auxiliary_loss_mlp": 0.010884, + "balance_loss_clip": 1.03154171, + "balance_loss_mlp": 1.00686038, + "epoch": 0.2888234233150965, + "flos": 23330696152320.0, + "grad_norm": 2.3701772429315238, + "language_loss": 0.73463154, + "learning_rate": 3.337643615911398e-06, + "loss": 0.75681853, + "num_input_tokens_seen": 51434160, + "step": 2402, + "time_per_iteration": 3.5916945934295654 + }, + { + "auxiliary_loss_clip": 0.01140121, + "auxiliary_loss_mlp": 0.01087682, + "balance_loss_clip": 1.03250527, + "balance_loss_mlp": 1.00604761, + "epoch": 0.2889436662057356, + "flos": 22272767856000.0, + "grad_norm": 1.8094279294729385, + "language_loss": 0.78351957, + "learning_rate": 3.3370644092200026e-06, + "loss": 0.80579758, + "num_input_tokens_seen": 51451435, + "step": 2403, + "time_per_iteration": 2.66489577293396 + }, + { + "auxiliary_loss_clip": 0.01121123, + "auxiliary_loss_mlp": 0.01088657, + "balance_loss_clip": 1.03093922, + "balance_loss_mlp": 1.00716591, + "epoch": 0.2890639090963747, + "flos": 21616931381760.0, + "grad_norm": 2.854391906536365, + "language_loss": 0.78696543, + "learning_rate": 3.3364849996969985e-06, + "loss": 0.80906326, + "num_input_tokens_seen": 51471455, + "step": 2404, + "time_per_iteration": 2.818784713745117 + }, + { + "auxiliary_loss_clip": 0.01139057, + "auxiliary_loss_mlp": 0.01087344, + "balance_loss_clip": 1.03272521, + "balance_loss_mlp": 1.00590014, + "epoch": 0.28918415198701375, + "flos": 28585540333440.0, + "grad_norm": 2.3681543043251243, + "language_loss": 0.852476, + "learning_rate": 3.335905387430283e-06, + "loss": 0.87474, + "num_input_tokens_seen": 51492890, + "step": 2405, + "time_per_iteration": 2.733757257461548 + }, + { + "auxiliary_loss_clip": 0.01130733, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_clip": 1.03089261, + "balance_loss_mlp": 1.00728083, + "epoch": 0.28930439487765286, + "flos": 21944688007680.0, + "grad_norm": 1.9191796327466604, + "language_loss": 0.82799661, + "learning_rate": 3.335325572507782e-06, + "loss": 0.85019165, + "num_input_tokens_seen": 51513390, + "step": 2406, + "time_per_iteration": 3.6608340740203857 + }, + { + "auxiliary_loss_clip": 0.0115354, + "auxiliary_loss_mlp": 0.00873847, + "balance_loss_clip": 1.03779268, + "balance_loss_mlp": 1.00028086, + "epoch": 0.28942463776829197, + "flos": 19281911955840.0, + "grad_norm": 2.481119102500207, + "language_loss": 0.73677081, + "learning_rate": 3.3347455550174537e-06, + "loss": 0.75704467, + "num_input_tokens_seen": 51532730, + "step": 2407, + "time_per_iteration": 3.922879934310913 + }, + { + "auxiliary_loss_clip": 0.01121449, + "auxiliary_loss_mlp": 0.0108715, + "balance_loss_clip": 1.03059924, + "balance_loss_mlp": 1.00556278, + "epoch": 0.289544880658931, + "flos": 14645700737280.0, + "grad_norm": 1.6877739433647017, + "language_loss": 0.67848206, + "learning_rate": 3.3341653350472864e-06, + "loss": 0.70056802, + "num_input_tokens_seen": 51549560, + "step": 2408, + "time_per_iteration": 2.7370636463165283 + }, + { + "auxiliary_loss_clip": 0.01151557, + "auxiliary_loss_mlp": 0.01090255, + "balance_loss_clip": 1.03428459, + "balance_loss_mlp": 1.00833416, + "epoch": 0.28966512354957014, + "flos": 28621881918720.0, + "grad_norm": 2.142802872381603, + "language_loss": 0.69034672, + "learning_rate": 3.333584912685298e-06, + "loss": 0.7127648, + "num_input_tokens_seen": 51568180, + "step": 2409, + "time_per_iteration": 3.6596670150756836 + }, + { + "auxiliary_loss_clip": 0.01097894, + "auxiliary_loss_mlp": 0.01081721, + "balance_loss_clip": 1.03585005, + "balance_loss_mlp": 1.00199413, + "epoch": 0.28978536644020925, + "flos": 64711784511360.0, + "grad_norm": 0.8579282516511552, + "language_loss": 0.55508995, + "learning_rate": 3.3330042880195385e-06, + "loss": 0.57688606, + "num_input_tokens_seen": 51622530, + "step": 2410, + "time_per_iteration": 3.25338077545166 + }, + { + "auxiliary_loss_clip": 0.01129614, + "auxiliary_loss_mlp": 0.01088106, + "balance_loss_clip": 1.03040326, + "balance_loss_mlp": 1.006567, + "epoch": 0.2899056093308483, + "flos": 18624638937600.0, + "grad_norm": 1.9242204681606727, + "language_loss": 0.78754056, + "learning_rate": 3.3324234611380888e-06, + "loss": 0.80971777, + "num_input_tokens_seen": 51641260, + "step": 2411, + "time_per_iteration": 2.7395803928375244 + }, + { + "auxiliary_loss_clip": 0.01113215, + "auxiliary_loss_mlp": 0.01088277, + "balance_loss_clip": 1.03071141, + "balance_loss_mlp": 1.00692844, + "epoch": 0.2900258522214874, + "flos": 22893735202560.0, + "grad_norm": 1.6891028305387956, + "language_loss": 0.821307, + "learning_rate": 3.3318424321290596e-06, + "loss": 0.84332198, + "num_input_tokens_seen": 51660975, + "step": 2412, + "time_per_iteration": 2.818162441253662 + }, + { + "auxiliary_loss_clip": 0.0109643, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_clip": 1.0300765, + "balance_loss_mlp": 1.00044775, + "epoch": 0.2901460951121265, + "flos": 71106036013440.0, + "grad_norm": 0.825332320500399, + "language_loss": 0.59997272, + "learning_rate": 3.3312612010805917e-06, + "loss": 0.62173867, + "num_input_tokens_seen": 51720550, + "step": 2413, + "time_per_iteration": 3.3597049713134766 + }, + { + "auxiliary_loss_clip": 0.01123762, + "auxiliary_loss_mlp": 0.0108991, + "balance_loss_clip": 1.03063869, + "balance_loss_mlp": 1.00827503, + "epoch": 0.2902663380027656, + "flos": 32160986081280.0, + "grad_norm": 1.6376090087352515, + "language_loss": 0.69693625, + "learning_rate": 3.330679768080858e-06, + "loss": 0.71907294, + "num_input_tokens_seen": 51744435, + "step": 2414, + "time_per_iteration": 2.811652421951294 + }, + { + "auxiliary_loss_clip": 0.01141233, + "auxiliary_loss_mlp": 0.01089985, + "balance_loss_clip": 1.03468847, + "balance_loss_mlp": 1.00844574, + "epoch": 0.2903865808934047, + "flos": 29351658539520.0, + "grad_norm": 2.466060989195073, + "language_loss": 0.83980751, + "learning_rate": 3.3300981332180627e-06, + "loss": 0.86211967, + "num_input_tokens_seen": 51763640, + "step": 2415, + "time_per_iteration": 2.801210641860962 + }, + { + "auxiliary_loss_clip": 0.01109142, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_clip": 1.03101778, + "balance_loss_mlp": 1.00572586, + "epoch": 0.29050682378404374, + "flos": 17089026647040.0, + "grad_norm": 1.8063219374758352, + "language_loss": 0.80432057, + "learning_rate": 3.3295162965804373e-06, + "loss": 0.82628459, + "num_input_tokens_seen": 51782135, + "step": 2416, + "time_per_iteration": 2.7858188152313232 + }, + { + "auxiliary_loss_clip": 0.01120632, + "auxiliary_loss_mlp": 0.01089375, + "balance_loss_clip": 1.03198755, + "balance_loss_mlp": 1.00778818, + "epoch": 0.29062706667468285, + "flos": 17858233422720.0, + "grad_norm": 2.020209320998709, + "language_loss": 0.78529704, + "learning_rate": 3.328934258256247e-06, + "loss": 0.80739707, + "num_input_tokens_seen": 51800200, + "step": 2417, + "time_per_iteration": 2.842527389526367 + }, + { + "auxiliary_loss_clip": 0.01138356, + "auxiliary_loss_mlp": 0.01087509, + "balance_loss_clip": 1.03197002, + "balance_loss_mlp": 1.00596929, + "epoch": 0.29074730956532197, + "flos": 24279815174400.0, + "grad_norm": 1.8661991298971203, + "language_loss": 0.67230719, + "learning_rate": 3.3283520183337856e-06, + "loss": 0.69456577, + "num_input_tokens_seen": 51819905, + "step": 2418, + "time_per_iteration": 2.822882890701294 + }, + { + "auxiliary_loss_clip": 0.01121374, + "auxiliary_loss_mlp": 0.0108828, + "balance_loss_clip": 1.02948797, + "balance_loss_mlp": 1.0068841, + "epoch": 0.290867552455961, + "flos": 22340961826560.0, + "grad_norm": 2.1796494184103254, + "language_loss": 0.69198275, + "learning_rate": 3.3277695769013797e-06, + "loss": 0.71407926, + "num_input_tokens_seen": 51839350, + "step": 2419, + "time_per_iteration": 2.767477512359619 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01089523, + "balance_loss_clip": 1.03207469, + "balance_loss_mlp": 1.00779307, + "epoch": 0.29098779534660013, + "flos": 23186155824000.0, + "grad_norm": 1.9231304686068325, + "language_loss": 0.7773869, + "learning_rate": 3.327186934047385e-06, + "loss": 0.79967201, + "num_input_tokens_seen": 51858045, + "step": 2420, + "time_per_iteration": 2.749220371246338 + }, + { + "auxiliary_loss_clip": 0.01130829, + "auxiliary_loss_mlp": 0.01089743, + "balance_loss_clip": 1.0312624, + "balance_loss_mlp": 1.00829935, + "epoch": 0.29110803823723924, + "flos": 15304194817920.0, + "grad_norm": 2.737901715188147, + "language_loss": 0.65865821, + "learning_rate": 3.3266040898601877e-06, + "loss": 0.68086398, + "num_input_tokens_seen": 51875880, + "step": 2421, + "time_per_iteration": 2.7166972160339355 + }, + { + "auxiliary_loss_clip": 0.01106083, + "auxiliary_loss_mlp": 0.01086983, + "balance_loss_clip": 1.03445721, + "balance_loss_mlp": 1.00577784, + "epoch": 0.2912282811278783, + "flos": 22595352923520.0, + "grad_norm": 1.837921386425914, + "language_loss": 0.77775556, + "learning_rate": 3.3260210444282045e-06, + "loss": 0.79968619, + "num_input_tokens_seen": 51893835, + "step": 2422, + "time_per_iteration": 2.806668758392334 + }, + { + "auxiliary_loss_clip": 0.01131959, + "auxiliary_loss_mlp": 0.01086834, + "balance_loss_clip": 1.02734733, + "balance_loss_mlp": 1.00543785, + "epoch": 0.2913485240185174, + "flos": 24497900599680.0, + "grad_norm": 2.283605659251801, + "language_loss": 0.7328738, + "learning_rate": 3.325437797839883e-06, + "loss": 0.75506175, + "num_input_tokens_seen": 51912205, + "step": 2423, + "time_per_iteration": 2.696340799331665 + }, + { + "auxiliary_loss_clip": 0.01149199, + "auxiliary_loss_mlp": 0.01088975, + "balance_loss_clip": 1.03319883, + "balance_loss_mlp": 1.00743616, + "epoch": 0.2914687669091565, + "flos": 17931024334080.0, + "grad_norm": 2.3388790961071675, + "language_loss": 0.75061196, + "learning_rate": 3.3248543501837015e-06, + "loss": 0.77299368, + "num_input_tokens_seen": 51929410, + "step": 2424, + "time_per_iteration": 2.6620032787323 + }, + { + "auxiliary_loss_clip": 0.01099672, + "auxiliary_loss_mlp": 0.01088637, + "balance_loss_clip": 1.03056407, + "balance_loss_mlp": 1.0071454, + "epoch": 0.2915890097997956, + "flos": 22529313768960.0, + "grad_norm": 1.7212384881810134, + "language_loss": 0.77081931, + "learning_rate": 3.3242707015481684e-06, + "loss": 0.79270238, + "num_input_tokens_seen": 51949345, + "step": 2425, + "time_per_iteration": 2.786320209503174 + }, + { + "auxiliary_loss_clip": 0.01130699, + "auxiliary_loss_mlp": 0.01086835, + "balance_loss_clip": 1.03123522, + "balance_loss_mlp": 1.00553477, + "epoch": 0.2917092526904347, + "flos": 13845216193920.0, + "grad_norm": 1.6652097371094206, + "language_loss": 0.80633378, + "learning_rate": 3.323686852021823e-06, + "loss": 0.82850921, + "num_input_tokens_seen": 51966855, + "step": 2426, + "time_per_iteration": 2.8985085487365723 + }, + { + "auxiliary_loss_clip": 0.01121699, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_clip": 1.03056049, + "balance_loss_mlp": 1.00674129, + "epoch": 0.2918294955810738, + "flos": 22674859678080.0, + "grad_norm": 2.4393966068401545, + "language_loss": 0.79448128, + "learning_rate": 3.323102801693235e-06, + "loss": 0.8165791, + "num_input_tokens_seen": 51985620, + "step": 2427, + "time_per_iteration": 3.6165785789489746 + }, + { + "auxiliary_loss_clip": 0.01141006, + "auxiliary_loss_mlp": 0.01086747, + "balance_loss_clip": 1.03320062, + "balance_loss_mlp": 1.00525582, + "epoch": 0.29194973847171285, + "flos": 23438284364160.0, + "grad_norm": 2.1990667390715934, + "language_loss": 0.80489671, + "learning_rate": 3.322518550651003e-06, + "loss": 0.82717431, + "num_input_tokens_seen": 52004930, + "step": 2428, + "time_per_iteration": 2.70973539352417 + }, + { + "auxiliary_loss_clip": 0.01132888, + "auxiliary_loss_mlp": 0.01091097, + "balance_loss_clip": 1.03298211, + "balance_loss_mlp": 1.00955784, + "epoch": 0.29206998136235196, + "flos": 21909064694400.0, + "grad_norm": 1.7598108822732517, + "language_loss": 0.80936462, + "learning_rate": 3.3219340989837586e-06, + "loss": 0.83160448, + "num_input_tokens_seen": 52024920, + "step": 2429, + "time_per_iteration": 2.6900267601013184 + }, + { + "auxiliary_loss_clip": 0.01127769, + "auxiliary_loss_mlp": 0.01088997, + "balance_loss_clip": 1.0304184, + "balance_loss_mlp": 1.00764823, + "epoch": 0.292190224252991, + "flos": 23215925220480.0, + "grad_norm": 1.7385009119778059, + "language_loss": 0.80216908, + "learning_rate": 3.3213494467801625e-06, + "loss": 0.82433671, + "num_input_tokens_seen": 52044095, + "step": 2430, + "time_per_iteration": 2.717651605606079 + }, + { + "auxiliary_loss_clip": 0.01084519, + "auxiliary_loss_mlp": 0.0108701, + "balance_loss_clip": 1.03074968, + "balance_loss_mlp": 1.00542331, + "epoch": 0.2923104671436301, + "flos": 20740818752640.0, + "grad_norm": 1.9422978761786331, + "language_loss": 0.71243668, + "learning_rate": 3.3207645941289063e-06, + "loss": 0.73415196, + "num_input_tokens_seen": 52062440, + "step": 2431, + "time_per_iteration": 3.9669055938720703 + }, + { + "auxiliary_loss_clip": 0.01139403, + "auxiliary_loss_mlp": 0.00873946, + "balance_loss_clip": 1.03252137, + "balance_loss_mlp": 1.00022495, + "epoch": 0.29243071003426924, + "flos": 35809114999680.0, + "grad_norm": 2.520764635013049, + "language_loss": 0.80381906, + "learning_rate": 3.320179541118711e-06, + "loss": 0.82395256, + "num_input_tokens_seen": 52084940, + "step": 2432, + "time_per_iteration": 2.768500328063965 + }, + { + "auxiliary_loss_clip": 0.01111286, + "auxiliary_loss_mlp": 0.01080253, + "balance_loss_clip": 1.03308654, + "balance_loss_mlp": 1.00090718, + "epoch": 0.2925509529249083, + "flos": 58081598524800.0, + "grad_norm": 0.9971992708355273, + "language_loss": 0.60329783, + "learning_rate": 3.3195942878383293e-06, + "loss": 0.62521327, + "num_input_tokens_seen": 52141040, + "step": 2433, + "time_per_iteration": 4.200402498245239 + }, + { + "auxiliary_loss_clip": 0.01138872, + "auxiliary_loss_mlp": 0.01087132, + "balance_loss_clip": 1.03174806, + "balance_loss_mlp": 1.00554538, + "epoch": 0.2926711958155474, + "flos": 21397122103680.0, + "grad_norm": 1.7298013642869496, + "language_loss": 0.77917922, + "learning_rate": 3.319008834376543e-06, + "loss": 0.80143929, + "num_input_tokens_seen": 52160730, + "step": 2434, + "time_per_iteration": 3.6094372272491455 + }, + { + "auxiliary_loss_clip": 0.01110553, + "auxiliary_loss_mlp": 0.01088685, + "balance_loss_clip": 1.03080893, + "balance_loss_mlp": 1.00705028, + "epoch": 0.2927914387061865, + "flos": 23185796688000.0, + "grad_norm": 2.934810136174872, + "language_loss": 0.88515496, + "learning_rate": 3.3184231808221654e-06, + "loss": 0.90714741, + "num_input_tokens_seen": 52175055, + "step": 2435, + "time_per_iteration": 2.7475621700286865 + }, + { + "auxiliary_loss_clip": 0.01115751, + "auxiliary_loss_mlp": 0.01087507, + "balance_loss_clip": 1.02762818, + "balance_loss_mlp": 1.00596833, + "epoch": 0.29291168159682557, + "flos": 22455553190400.0, + "grad_norm": 1.8028097800764566, + "language_loss": 0.62567681, + "learning_rate": 3.3178373272640394e-06, + "loss": 0.64770937, + "num_input_tokens_seen": 52194150, + "step": 2436, + "time_per_iteration": 2.7883825302124023 + }, + { + "auxiliary_loss_clip": 0.01148245, + "auxiliary_loss_mlp": 0.0108844, + "balance_loss_clip": 1.03262305, + "balance_loss_mlp": 1.00690079, + "epoch": 0.2930319244874647, + "flos": 21170632896000.0, + "grad_norm": 2.1507379038389742, + "language_loss": 0.84626687, + "learning_rate": 3.3172512737910387e-06, + "loss": 0.86863375, + "num_input_tokens_seen": 52211660, + "step": 2437, + "time_per_iteration": 2.6271140575408936 + }, + { + "auxiliary_loss_clip": 0.01142361, + "auxiliary_loss_mlp": 0.01088579, + "balance_loss_clip": 1.03462553, + "balance_loss_mlp": 1.0071826, + "epoch": 0.2931521673781038, + "flos": 31357843931520.0, + "grad_norm": 2.4770225606150373, + "language_loss": 0.88179696, + "learning_rate": 3.3166650204920674e-06, + "loss": 0.90410638, + "num_input_tokens_seen": 52232830, + "step": 2438, + "time_per_iteration": 2.7668089866638184 + }, + { + "auxiliary_loss_clip": 0.01137411, + "auxiliary_loss_mlp": 0.01088175, + "balance_loss_clip": 1.03083158, + "balance_loss_mlp": 1.00668311, + "epoch": 0.29327241026874284, + "flos": 24200990778240.0, + "grad_norm": 1.6620475941845476, + "language_loss": 0.81513119, + "learning_rate": 3.316078567456059e-06, + "loss": 0.83738708, + "num_input_tokens_seen": 52250670, + "step": 2439, + "time_per_iteration": 2.756826639175415 + }, + { + "auxiliary_loss_clip": 0.01087403, + "auxiliary_loss_mlp": 0.01086136, + "balance_loss_clip": 1.02849197, + "balance_loss_mlp": 1.00469184, + "epoch": 0.29339265315938196, + "flos": 24242611662720.0, + "grad_norm": 1.4669006341546322, + "language_loss": 0.75914109, + "learning_rate": 3.3154919147719786e-06, + "loss": 0.78087652, + "num_input_tokens_seen": 52271685, + "step": 2440, + "time_per_iteration": 2.8806519508361816 + }, + { + "auxiliary_loss_clip": 0.01138316, + "auxiliary_loss_mlp": 0.01087189, + "balance_loss_clip": 1.03126788, + "balance_loss_mlp": 1.00579286, + "epoch": 0.29351289605002107, + "flos": 16946641134720.0, + "grad_norm": 2.0429114998296387, + "language_loss": 0.86455226, + "learning_rate": 3.31490506252882e-06, + "loss": 0.88680726, + "num_input_tokens_seen": 52291065, + "step": 2441, + "time_per_iteration": 2.7414944171905518 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.01086999, + "balance_loss_clip": 1.02988935, + "balance_loss_mlp": 1.00565052, + "epoch": 0.2936331389406601, + "flos": 19829082810240.0, + "grad_norm": 1.787122272218517, + "language_loss": 0.84201229, + "learning_rate": 3.31431801081561e-06, + "loss": 0.86402351, + "num_input_tokens_seen": 52310000, + "step": 2442, + "time_per_iteration": 2.7529568672180176 + }, + { + "auxiliary_loss_clip": 0.01113175, + "auxiliary_loss_mlp": 0.01080063, + "balance_loss_clip": 1.02826989, + "balance_loss_mlp": 1.00033605, + "epoch": 0.29375338183129923, + "flos": 71416844398080.0, + "grad_norm": 0.8943736192945531, + "language_loss": 0.67941892, + "learning_rate": 3.313730759721402e-06, + "loss": 0.70135128, + "num_input_tokens_seen": 52372930, + "step": 2443, + "time_per_iteration": 3.3794283866882324 + }, + { + "auxiliary_loss_clip": 0.01128399, + "auxiliary_loss_mlp": 0.01089826, + "balance_loss_clip": 1.03110838, + "balance_loss_mlp": 1.00838208, + "epoch": 0.29387362472193834, + "flos": 22054502862720.0, + "grad_norm": 1.8988791218802665, + "language_loss": 0.86216879, + "learning_rate": 3.313143309335282e-06, + "loss": 0.88435102, + "num_input_tokens_seen": 52391420, + "step": 2444, + "time_per_iteration": 2.7344272136688232 + }, + { + "auxiliary_loss_clip": 0.01100009, + "auxiliary_loss_mlp": 0.01087207, + "balance_loss_clip": 1.02994859, + "balance_loss_mlp": 1.00566804, + "epoch": 0.2939938676125774, + "flos": 22966418373120.0, + "grad_norm": 1.8162378210735772, + "language_loss": 0.85086048, + "learning_rate": 3.3125556597463665e-06, + "loss": 0.87273264, + "num_input_tokens_seen": 52410725, + "step": 2445, + "time_per_iteration": 2.804774284362793 + }, + { + "auxiliary_loss_clip": 0.01139185, + "auxiliary_loss_mlp": 0.01088518, + "balance_loss_clip": 1.03268325, + "balance_loss_mlp": 1.0071696, + "epoch": 0.2941141105032165, + "flos": 31358705857920.0, + "grad_norm": 1.671017687685126, + "language_loss": 0.66550291, + "learning_rate": 3.311967811043801e-06, + "loss": 0.6877799, + "num_input_tokens_seen": 52432645, + "step": 2446, + "time_per_iteration": 2.8237345218658447 + }, + { + "auxiliary_loss_clip": 0.01139725, + "auxiliary_loss_mlp": 0.01088529, + "balance_loss_clip": 1.03327227, + "balance_loss_mlp": 1.00713348, + "epoch": 0.29423435339385556, + "flos": 23222138273280.0, + "grad_norm": 2.184003681556042, + "language_loss": 0.81178749, + "learning_rate": 3.3113797633167617e-06, + "loss": 0.83407003, + "num_input_tokens_seen": 52450940, + "step": 2447, + "time_per_iteration": 2.6946301460266113 + }, + { + "auxiliary_loss_clip": 0.01148516, + "auxiliary_loss_mlp": 0.01087786, + "balance_loss_clip": 1.03268242, + "balance_loss_mlp": 1.00634229, + "epoch": 0.2943545962844947, + "flos": 26864054138880.0, + "grad_norm": 2.260931036078637, + "language_loss": 0.68713194, + "learning_rate": 3.310791516654455e-06, + "loss": 0.70949495, + "num_input_tokens_seen": 52468000, + "step": 2448, + "time_per_iteration": 2.6877145767211914 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01088151, + "balance_loss_clip": 1.03104591, + "balance_loss_mlp": 1.0065645, + "epoch": 0.2944748391751338, + "flos": 20231677422720.0, + "grad_norm": 1.8886190860827186, + "language_loss": 0.7949754, + "learning_rate": 3.3102030711461177e-06, + "loss": 0.81707454, + "num_input_tokens_seen": 52487575, + "step": 2449, + "time_per_iteration": 2.748793363571167 + }, + { + "auxiliary_loss_clip": 0.01117566, + "auxiliary_loss_mlp": 0.01088092, + "balance_loss_clip": 1.02879715, + "balance_loss_mlp": 1.00650549, + "epoch": 0.29459508206577284, + "flos": 15960965045760.0, + "grad_norm": 1.705670183389649, + "language_loss": 0.67940688, + "learning_rate": 3.3096144268810156e-06, + "loss": 0.70146346, + "num_input_tokens_seen": 52506335, + "step": 2450, + "time_per_iteration": 2.7806215286254883 + }, + { + "auxiliary_loss_clip": 0.01138806, + "auxiliary_loss_mlp": 0.01087495, + "balance_loss_clip": 1.03169155, + "balance_loss_mlp": 1.00586009, + "epoch": 0.29471532495641195, + "flos": 20412882558720.0, + "grad_norm": 2.1769788904096616, + "language_loss": 0.72818029, + "learning_rate": 3.3090255839484462e-06, + "loss": 0.75044334, + "num_input_tokens_seen": 52524330, + "step": 2451, + "time_per_iteration": 2.6293234825134277 + }, + { + "auxiliary_loss_clip": 0.01121884, + "auxiliary_loss_mlp": 0.0108797, + "balance_loss_clip": 1.0288868, + "balance_loss_mlp": 1.00638318, + "epoch": 0.29483556784705106, + "flos": 20376576887040.0, + "grad_norm": 1.8991853640044238, + "language_loss": 0.85295862, + "learning_rate": 3.3084365424377366e-06, + "loss": 0.8750571, + "num_input_tokens_seen": 52543095, + "step": 2452, + "time_per_iteration": 2.711832284927368 + }, + { + "auxiliary_loss_clip": 0.01087907, + "auxiliary_loss_mlp": 0.0108056, + "balance_loss_clip": 1.02108109, + "balance_loss_mlp": 1.00083303, + "epoch": 0.2949558107376901, + "flos": 68555660595840.0, + "grad_norm": 0.7250073412617347, + "language_loss": 0.55931413, + "learning_rate": 3.307847302438245e-06, + "loss": 0.58099884, + "num_input_tokens_seen": 52597075, + "step": 2453, + "time_per_iteration": 4.200384855270386 + }, + { + "auxiliary_loss_clip": 0.011002, + "auxiliary_loss_mlp": 0.01086185, + "balance_loss_clip": 1.03050065, + "balance_loss_mlp": 1.00469351, + "epoch": 0.2950760536283292, + "flos": 16107085572480.0, + "grad_norm": 2.4666876153139863, + "language_loss": 0.77219254, + "learning_rate": 3.3072578640393562e-06, + "loss": 0.79405642, + "num_input_tokens_seen": 52614410, + "step": 2454, + "time_per_iteration": 2.7335002422332764 + }, + { + "auxiliary_loss_clip": 0.01127543, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_clip": 1.03053129, + "balance_loss_mlp": 1.00747108, + "epoch": 0.29519629651896834, + "flos": 20483626394880.0, + "grad_norm": 1.9266518376492197, + "language_loss": 0.79636604, + "learning_rate": 3.3066682273304886e-06, + "loss": 0.81852919, + "num_input_tokens_seen": 52632055, + "step": 2455, + "time_per_iteration": 2.7714428901672363 + }, + { + "auxiliary_loss_clip": 0.01140073, + "auxiliary_loss_mlp": 0.00873939, + "balance_loss_clip": 1.03240478, + "balance_loss_mlp": 1.0002563, + "epoch": 0.2953165394096074, + "flos": 18916484941440.0, + "grad_norm": 1.8712671682630018, + "language_loss": 0.78569096, + "learning_rate": 3.3060783924010904e-06, + "loss": 0.80583107, + "num_input_tokens_seen": 52649980, + "step": 2456, + "time_per_iteration": 2.673769474029541 + }, + { + "auxiliary_loss_clip": 0.01115694, + "auxiliary_loss_mlp": 0.01087891, + "balance_loss_clip": 1.02728581, + "balance_loss_mlp": 1.00639939, + "epoch": 0.2954367823002465, + "flos": 20624467622400.0, + "grad_norm": 2.0728396617561513, + "language_loss": 0.84685802, + "learning_rate": 3.3054883593406387e-06, + "loss": 0.86889386, + "num_input_tokens_seen": 52664730, + "step": 2457, + "time_per_iteration": 3.6228675842285156 + }, + { + "auxiliary_loss_clip": 0.01130324, + "auxiliary_loss_mlp": 0.01089789, + "balance_loss_clip": 1.03113925, + "balance_loss_mlp": 1.00844049, + "epoch": 0.2955570251908856, + "flos": 31175525473920.0, + "grad_norm": 2.2274700492134674, + "language_loss": 0.65322655, + "learning_rate": 3.3048981282386404e-06, + "loss": 0.67542768, + "num_input_tokens_seen": 52686040, + "step": 2458, + "time_per_iteration": 3.850351095199585 + }, + { + "auxiliary_loss_clip": 0.01116982, + "auxiliary_loss_mlp": 0.0108837, + "balance_loss_clip": 1.02852845, + "balance_loss_mlp": 1.00697422, + "epoch": 0.29567726808152467, + "flos": 21650328051840.0, + "grad_norm": 2.041671229224764, + "language_loss": 0.82404625, + "learning_rate": 3.304307699184634e-06, + "loss": 0.84609973, + "num_input_tokens_seen": 52704630, + "step": 2459, + "time_per_iteration": 3.899233341217041 + }, + { + "auxiliary_loss_clip": 0.01117192, + "auxiliary_loss_mlp": 0.01089658, + "balance_loss_clip": 1.03215325, + "balance_loss_mlp": 1.00840521, + "epoch": 0.2957975109721638, + "flos": 24243868638720.0, + "grad_norm": 1.6897328072164939, + "language_loss": 0.78863579, + "learning_rate": 3.3037170722681866e-06, + "loss": 0.81070429, + "num_input_tokens_seen": 52725465, + "step": 2460, + "time_per_iteration": 2.8315043449401855 + }, + { + "auxiliary_loss_clip": 0.01118042, + "auxiliary_loss_mlp": 0.01087836, + "balance_loss_clip": 1.02991307, + "balance_loss_mlp": 1.00653565, + "epoch": 0.29591775386280283, + "flos": 13479717352320.0, + "grad_norm": 1.7784181912334331, + "language_loss": 0.68115723, + "learning_rate": 3.3031262475788956e-06, + "loss": 0.70321596, + "num_input_tokens_seen": 52742405, + "step": 2461, + "time_per_iteration": 2.7522687911987305 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.01087874, + "balance_loss_clip": 1.03155565, + "balance_loss_mlp": 1.0065254, + "epoch": 0.29603799675344195, + "flos": 17749783284480.0, + "grad_norm": 1.705641043355668, + "language_loss": 0.73381436, + "learning_rate": 3.3025352252063897e-06, + "loss": 0.75599325, + "num_input_tokens_seen": 52761100, + "step": 2462, + "time_per_iteration": 2.70389723777771 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01089133, + "balance_loss_clip": 1.03190327, + "balance_loss_mlp": 1.00778472, + "epoch": 0.29615823964408106, + "flos": 22783920347520.0, + "grad_norm": 2.6132187164911067, + "language_loss": 0.74984121, + "learning_rate": 3.3019440052403252e-06, + "loss": 0.77209926, + "num_input_tokens_seen": 52780965, + "step": 2463, + "time_per_iteration": 2.7470052242279053 + }, + { + "auxiliary_loss_clip": 0.01131387, + "auxiliary_loss_mlp": 0.01090427, + "balance_loss_clip": 1.03317022, + "balance_loss_mlp": 1.00903058, + "epoch": 0.2962784825347201, + "flos": 23514199758720.0, + "grad_norm": 1.6798214302882906, + "language_loss": 0.70502377, + "learning_rate": 3.30135258777039e-06, + "loss": 0.72724187, + "num_input_tokens_seen": 52800335, + "step": 2464, + "time_per_iteration": 2.7095751762390137 + }, + { + "auxiliary_loss_clip": 0.01137795, + "auxiliary_loss_mlp": 0.00873828, + "balance_loss_clip": 1.03039265, + "balance_loss_mlp": 1.00028396, + "epoch": 0.2963987254253592, + "flos": 16362769559040.0, + "grad_norm": 1.8827007798870026, + "language_loss": 0.69707394, + "learning_rate": 3.3007609728863024e-06, + "loss": 0.71719015, + "num_input_tokens_seen": 52818425, + "step": 2465, + "time_per_iteration": 2.7019057273864746 + }, + { + "auxiliary_loss_clip": 0.01094218, + "auxiliary_loss_mlp": 0.01087709, + "balance_loss_clip": 1.02627778, + "balance_loss_mlp": 1.00636053, + "epoch": 0.29651896831599833, + "flos": 33472263980160.0, + "grad_norm": 2.3445855878853683, + "language_loss": 0.73281407, + "learning_rate": 3.300169160677809e-06, + "loss": 0.75463331, + "num_input_tokens_seen": 52842340, + "step": 2466, + "time_per_iteration": 2.9764294624328613 + }, + { + "auxiliary_loss_clip": 0.01121118, + "auxiliary_loss_mlp": 0.0108907, + "balance_loss_clip": 1.03119731, + "balance_loss_mlp": 1.00743532, + "epoch": 0.2966392112066374, + "flos": 23805363404160.0, + "grad_norm": 2.6884518268693993, + "language_loss": 0.76943904, + "learning_rate": 3.2995771512346878e-06, + "loss": 0.79154092, + "num_input_tokens_seen": 52860690, + "step": 2467, + "time_per_iteration": 2.7542524337768555 + }, + { + "auxiliary_loss_clip": 0.01149318, + "auxiliary_loss_mlp": 0.0087385, + "balance_loss_clip": 1.03373051, + "balance_loss_mlp": 1.00029993, + "epoch": 0.2967594540972765, + "flos": 19938466702080.0, + "grad_norm": 2.1840280135153876, + "language_loss": 0.73207355, + "learning_rate": 3.298984944646746e-06, + "loss": 0.75230521, + "num_input_tokens_seen": 52879370, + "step": 2468, + "time_per_iteration": 2.699065923690796 + }, + { + "auxiliary_loss_clip": 0.01139962, + "auxiliary_loss_mlp": 0.00873723, + "balance_loss_clip": 1.03299332, + "balance_loss_mlp": 1.00032866, + "epoch": 0.2968796969879156, + "flos": 23732823888000.0, + "grad_norm": 1.8369203704278487, + "language_loss": 0.81567848, + "learning_rate": 3.298392541003822e-06, + "loss": 0.83581537, + "num_input_tokens_seen": 52898775, + "step": 2469, + "time_per_iteration": 2.709299325942993 + }, + { + "auxiliary_loss_clip": 0.01121724, + "auxiliary_loss_mlp": 0.01087955, + "balance_loss_clip": 1.02947104, + "balance_loss_mlp": 1.00670171, + "epoch": 0.29699993987855466, + "flos": 22893699288960.0, + "grad_norm": 1.7667064296850317, + "language_loss": 0.89904761, + "learning_rate": 3.2977999403957806e-06, + "loss": 0.92114437, + "num_input_tokens_seen": 52917535, + "step": 2470, + "time_per_iteration": 2.754546642303467 + }, + { + "auxiliary_loss_clip": 0.01149568, + "auxiliary_loss_mlp": 0.01088443, + "balance_loss_clip": 1.03428936, + "balance_loss_mlp": 1.00704706, + "epoch": 0.2971201827691938, + "flos": 33832555349760.0, + "grad_norm": 2.466444412658954, + "language_loss": 0.67516237, + "learning_rate": 3.2972071429125207e-06, + "loss": 0.69754255, + "num_input_tokens_seen": 52938755, + "step": 2471, + "time_per_iteration": 2.720292806625366 + }, + { + "auxiliary_loss_clip": 0.01116122, + "auxiliary_loss_mlp": 0.01088396, + "balance_loss_clip": 1.02901936, + "balance_loss_mlp": 1.00700045, + "epoch": 0.2972404256598329, + "flos": 22054359208320.0, + "grad_norm": 2.2907872105221494, + "language_loss": 0.88778424, + "learning_rate": 3.2966141486439682e-06, + "loss": 0.9098295, + "num_input_tokens_seen": 52957945, + "step": 2472, + "time_per_iteration": 2.8145854473114014 + }, + { + "auxiliary_loss_clip": 0.01102574, + "auxiliary_loss_mlp": 0.01087166, + "balance_loss_clip": 1.02852559, + "balance_loss_mlp": 1.00577044, + "epoch": 0.29736066855047194, + "flos": 31978595796480.0, + "grad_norm": 8.497303587233715, + "language_loss": 0.63568562, + "learning_rate": 3.29602095768008e-06, + "loss": 0.65758306, + "num_input_tokens_seen": 52978460, + "step": 2473, + "time_per_iteration": 2.931946039199829 + }, + { + "auxiliary_loss_clip": 0.01126771, + "auxiliary_loss_mlp": 0.01088642, + "balance_loss_clip": 1.03067815, + "balance_loss_mlp": 1.00724554, + "epoch": 0.29748091144111105, + "flos": 33510401245440.0, + "grad_norm": 2.121292473955547, + "language_loss": 0.63750452, + "learning_rate": 3.2954275701108437e-06, + "loss": 0.65965861, + "num_input_tokens_seen": 52999640, + "step": 2474, + "time_per_iteration": 2.8883395195007324 + }, + { + "auxiliary_loss_clip": 0.01110128, + "auxiliary_loss_mlp": 0.01087719, + "balance_loss_clip": 1.02879333, + "balance_loss_mlp": 1.00632322, + "epoch": 0.29760115433175016, + "flos": 41283373409280.0, + "grad_norm": 3.7301181629781945, + "language_loss": 0.684484, + "learning_rate": 3.294833986026275e-06, + "loss": 0.70646244, + "num_input_tokens_seen": 53022880, + "step": 2475, + "time_per_iteration": 3.0279483795166016 + }, + { + "auxiliary_loss_clip": 0.01122184, + "auxiliary_loss_mlp": 0.01087328, + "balance_loss_clip": 1.03228569, + "balance_loss_mlp": 1.00588417, + "epoch": 0.2977213972223892, + "flos": 24493339572480.0, + "grad_norm": 1.79411160340669, + "language_loss": 0.84950864, + "learning_rate": 3.29424020551642e-06, + "loss": 0.87160373, + "num_input_tokens_seen": 53041515, + "step": 2476, + "time_per_iteration": 2.7772889137268066 + }, + { + "auxiliary_loss_clip": 0.01150398, + "auxiliary_loss_mlp": 0.01087821, + "balance_loss_clip": 1.03459024, + "balance_loss_mlp": 1.00604355, + "epoch": 0.2978416401130283, + "flos": 21285116519040.0, + "grad_norm": 1.7580143157661186, + "language_loss": 0.72316003, + "learning_rate": 3.2936462286713546e-06, + "loss": 0.74554229, + "num_input_tokens_seen": 53059865, + "step": 2477, + "time_per_iteration": 2.6794750690460205 + }, + { + "auxiliary_loss_clip": 0.0113972, + "auxiliary_loss_mlp": 0.01088992, + "balance_loss_clip": 1.03334332, + "balance_loss_mlp": 1.00745273, + "epoch": 0.2979618830036674, + "flos": 25772154554880.0, + "grad_norm": 2.3204591978841504, + "language_loss": 0.77656424, + "learning_rate": 3.2930520555811846e-06, + "loss": 0.79885137, + "num_input_tokens_seen": 53079490, + "step": 2478, + "time_per_iteration": 3.5989372730255127 + }, + { + "auxiliary_loss_clip": 0.01082728, + "auxiliary_loss_mlp": 0.00873982, + "balance_loss_clip": 1.02238584, + "balance_loss_mlp": 1.00031722, + "epoch": 0.2980821258943065, + "flos": 23476996247040.0, + "grad_norm": 1.8567890111250707, + "language_loss": 0.79929215, + "learning_rate": 3.292457686336046e-06, + "loss": 0.81885922, + "num_input_tokens_seen": 53098810, + "step": 2479, + "time_per_iteration": 2.9374735355377197 + }, + { + "auxiliary_loss_clip": 0.01119869, + "auxiliary_loss_mlp": 0.01081688, + "balance_loss_clip": 1.03565371, + "balance_loss_mlp": 1.0019604, + "epoch": 0.2982023687849456, + "flos": 69752314195200.0, + "grad_norm": 0.9209539696369261, + "language_loss": 0.61259878, + "learning_rate": 3.291863121026105e-06, + "loss": 0.63461435, + "num_input_tokens_seen": 53162590, + "step": 2480, + "time_per_iteration": 3.364877700805664 + }, + { + "auxiliary_loss_clip": 0.01137869, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.00656486, + "epoch": 0.29832261167558466, + "flos": 29825930741760.0, + "grad_norm": 2.140411804075601, + "language_loss": 0.76878196, + "learning_rate": 3.2912683597415547e-06, + "loss": 0.7910412, + "num_input_tokens_seen": 53186675, + "step": 2481, + "time_per_iteration": 2.8138790130615234 + }, + { + "auxiliary_loss_clip": 0.01118012, + "auxiliary_loss_mlp": 0.01088038, + "balance_loss_clip": 1.02965391, + "balance_loss_mlp": 1.00664234, + "epoch": 0.29844285456622377, + "flos": 33910158683520.0, + "grad_norm": 3.157545762042041, + "language_loss": 0.78241551, + "learning_rate": 3.2906734025726213e-06, + "loss": 0.80447602, + "num_input_tokens_seen": 53205940, + "step": 2482, + "time_per_iteration": 3.0296285152435303 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01087422, + "balance_loss_clip": 1.03508389, + "balance_loss_mlp": 1.00602555, + "epoch": 0.2985630974568629, + "flos": 23876933253120.0, + "grad_norm": 1.780711086412213, + "language_loss": 0.87663054, + "learning_rate": 3.290078249609559e-06, + "loss": 0.89892817, + "num_input_tokens_seen": 53225360, + "step": 2483, + "time_per_iteration": 4.769945383071899 + }, + { + "auxiliary_loss_clip": 0.01138804, + "auxiliary_loss_mlp": 0.01088476, + "balance_loss_clip": 1.03257942, + "balance_loss_mlp": 1.00712788, + "epoch": 0.29868334034750194, + "flos": 21799106184960.0, + "grad_norm": 2.0643601936738287, + "language_loss": 0.88666904, + "learning_rate": 3.2894829009426514e-06, + "loss": 0.90894186, + "num_input_tokens_seen": 53243195, + "step": 2484, + "time_per_iteration": 2.6983511447906494 + }, + { + "auxiliary_loss_clip": 0.01137344, + "auxiliary_loss_mlp": 0.01086948, + "balance_loss_clip": 1.03191447, + "balance_loss_mlp": 1.00569463, + "epoch": 0.29880358323814105, + "flos": 25666649331840.0, + "grad_norm": 1.9602366062514782, + "language_loss": 0.77843499, + "learning_rate": 3.288887356662213e-06, + "loss": 0.8006779, + "num_input_tokens_seen": 53264530, + "step": 2485, + "time_per_iteration": 3.662008762359619 + }, + { + "auxiliary_loss_clip": 0.01126629, + "auxiliary_loss_mlp": 0.01082883, + "balance_loss_clip": 1.03505468, + "balance_loss_mlp": 1.00315559, + "epoch": 0.29892382612878016, + "flos": 71005846003200.0, + "grad_norm": 0.7719363341620589, + "language_loss": 0.59714973, + "learning_rate": 3.288291616858588e-06, + "loss": 0.61924487, + "num_input_tokens_seen": 53319920, + "step": 2486, + "time_per_iteration": 3.134220600128174 + }, + { + "auxiliary_loss_clip": 0.01101732, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_clip": 1.02860045, + "balance_loss_mlp": 1.00973058, + "epoch": 0.2990440690194192, + "flos": 25481134563840.0, + "grad_norm": 17.247390985440315, + "language_loss": 0.76776147, + "learning_rate": 3.287695681622149e-06, + "loss": 0.78969002, + "num_input_tokens_seen": 53339270, + "step": 2487, + "time_per_iteration": 2.8431127071380615 + }, + { + "auxiliary_loss_clip": 0.01130538, + "auxiliary_loss_mlp": 0.01088883, + "balance_loss_clip": 1.03139377, + "balance_loss_mlp": 1.00743961, + "epoch": 0.2991643119100583, + "flos": 23732357011200.0, + "grad_norm": 2.019157240325549, + "language_loss": 0.81055236, + "learning_rate": 3.2870995510432982e-06, + "loss": 0.83274651, + "num_input_tokens_seen": 53357750, + "step": 2488, + "time_per_iteration": 2.820279598236084 + }, + { + "auxiliary_loss_clip": 0.01129482, + "auxiliary_loss_mlp": 0.01087204, + "balance_loss_clip": 1.02986801, + "balance_loss_mlp": 1.00590277, + "epoch": 0.29928455480069743, + "flos": 27417545786880.0, + "grad_norm": 1.8211410038903098, + "language_loss": 0.77317721, + "learning_rate": 3.2865032252124697e-06, + "loss": 0.79534411, + "num_input_tokens_seen": 53378265, + "step": 2489, + "time_per_iteration": 2.711566686630249 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01088544, + "balance_loss_clip": 1.03295314, + "balance_loss_mlp": 1.00733912, + "epoch": 0.2994047976913365, + "flos": 33692935184640.0, + "grad_norm": 1.4878345129440584, + "language_loss": 0.77562475, + "learning_rate": 3.2859067042201243e-06, + "loss": 0.79783875, + "num_input_tokens_seen": 53400305, + "step": 2490, + "time_per_iteration": 2.916646718978882 + }, + { + "auxiliary_loss_clip": 0.01099437, + "auxiliary_loss_mlp": 0.01087372, + "balance_loss_clip": 1.02924776, + "balance_loss_mlp": 1.00602376, + "epoch": 0.2995250405819756, + "flos": 16763963541120.0, + "grad_norm": 1.7597366211866132, + "language_loss": 0.78253937, + "learning_rate": 3.2853099881567544e-06, + "loss": 0.80440748, + "num_input_tokens_seen": 53418705, + "step": 2491, + "time_per_iteration": 2.796419143676758 + }, + { + "auxiliary_loss_clip": 0.01147375, + "auxiliary_loss_mlp": 0.0108771, + "balance_loss_clip": 1.03272009, + "balance_loss_mlp": 1.00650489, + "epoch": 0.29964528347261465, + "flos": 22963976248320.0, + "grad_norm": 1.823138478627373, + "language_loss": 0.79175043, + "learning_rate": 3.284713077112881e-06, + "loss": 0.81410134, + "num_input_tokens_seen": 53438135, + "step": 2492, + "time_per_iteration": 2.7177135944366455 + }, + { + "auxiliary_loss_clip": 0.01108363, + "auxiliary_loss_mlp": 0.01087688, + "balance_loss_clip": 1.03186631, + "balance_loss_mlp": 1.00619662, + "epoch": 0.29976552636325376, + "flos": 16938021870720.0, + "grad_norm": 2.453429728141505, + "language_loss": 0.86750829, + "learning_rate": 3.284115971179056e-06, + "loss": 0.88946879, + "num_input_tokens_seen": 53452165, + "step": 2493, + "time_per_iteration": 2.7476954460144043 + }, + { + "auxiliary_loss_clip": 0.0108234, + "auxiliary_loss_mlp": 0.01087715, + "balance_loss_clip": 1.02573025, + "balance_loss_mlp": 1.0064621, + "epoch": 0.2998857692538929, + "flos": 17056455989760.0, + "grad_norm": 1.6939750572481374, + "language_loss": 0.78504336, + "learning_rate": 3.283518670445859e-06, + "loss": 0.80674392, + "num_input_tokens_seen": 53470075, + "step": 2494, + "time_per_iteration": 2.8672235012054443 + }, + { + "auxiliary_loss_clip": 0.0110409, + "auxiliary_loss_mlp": 0.00872905, + "balance_loss_clip": 1.03019381, + "balance_loss_mlp": 1.00006986, + "epoch": 0.30000601214453193, + "flos": 68831528025600.0, + "grad_norm": 0.68478584701475, + "language_loss": 0.54339725, + "learning_rate": 3.2829211750038995e-06, + "loss": 0.56316715, + "num_input_tokens_seen": 53538705, + "step": 2495, + "time_per_iteration": 3.4134316444396973 + }, + { + "auxiliary_loss_clip": 0.01110502, + "auxiliary_loss_mlp": 0.01088388, + "balance_loss_clip": 1.02775311, + "balance_loss_mlp": 1.00708699, + "epoch": 0.30012625503517104, + "flos": 17603267708160.0, + "grad_norm": 1.7657646586217635, + "language_loss": 0.89035547, + "learning_rate": 3.2823234849438183e-06, + "loss": 0.9123444, + "num_input_tokens_seen": 53556740, + "step": 2496, + "time_per_iteration": 2.9189813137054443 + }, + { + "auxiliary_loss_clip": 0.01118563, + "auxiliary_loss_mlp": 0.0108706, + "balance_loss_clip": 1.03325796, + "balance_loss_mlp": 1.00580716, + "epoch": 0.30024649792581015, + "flos": 21252581775360.0, + "grad_norm": 1.8772652828020955, + "language_loss": 0.75398821, + "learning_rate": 3.2817256003562836e-06, + "loss": 0.77604443, + "num_input_tokens_seen": 53577115, + "step": 2497, + "time_per_iteration": 2.7531075477600098 + }, + { + "auxiliary_loss_clip": 0.01094623, + "auxiliary_loss_mlp": 0.01087915, + "balance_loss_clip": 1.02796829, + "balance_loss_mlp": 1.00656652, + "epoch": 0.3003667408164492, + "flos": 23003262748800.0, + "grad_norm": 1.7262151184916352, + "language_loss": 0.66257417, + "learning_rate": 3.281127521331995e-06, + "loss": 0.68439955, + "num_input_tokens_seen": 53598295, + "step": 2498, + "time_per_iteration": 2.8952009677886963 + }, + { + "auxiliary_loss_clip": 0.01135674, + "auxiliary_loss_mlp": 0.01080231, + "balance_loss_clip": 1.03550982, + "balance_loss_mlp": 1.00050414, + "epoch": 0.3004869837070883, + "flos": 64232340750720.0, + "grad_norm": 0.8728937938329269, + "language_loss": 0.60688716, + "learning_rate": 3.2805292479616798e-06, + "loss": 0.6290462, + "num_input_tokens_seen": 53657160, + "step": 2499, + "time_per_iteration": 3.1042299270629883 + }, + { + "auxiliary_loss_clip": 0.0113083, + "auxiliary_loss_mlp": 0.01086939, + "balance_loss_clip": 1.03283811, + "balance_loss_mlp": 1.00549507, + "epoch": 0.30060722659772743, + "flos": 26248653400320.0, + "grad_norm": 2.7716529491101545, + "language_loss": 0.92287374, + "learning_rate": 3.2799307803360955e-06, + "loss": 0.94505143, + "num_input_tokens_seen": 53673090, + "step": 2500, + "time_per_iteration": 2.7749991416931152 + }, + { + "auxiliary_loss_clip": 0.01148023, + "auxiliary_loss_mlp": 0.01088534, + "balance_loss_clip": 1.03308344, + "balance_loss_mlp": 1.00728083, + "epoch": 0.3007274694883665, + "flos": 24970879912320.0, + "grad_norm": 1.4158943322261552, + "language_loss": 0.81248212, + "learning_rate": 3.27933211854603e-06, + "loss": 0.83484775, + "num_input_tokens_seen": 53692145, + "step": 2501, + "time_per_iteration": 2.6365411281585693 + }, + { + "auxiliary_loss_clip": 0.01123173, + "auxiliary_loss_mlp": 0.01089069, + "balance_loss_clip": 1.02724266, + "balance_loss_mlp": 1.00767279, + "epoch": 0.3008477123790056, + "flos": 17055845458560.0, + "grad_norm": 1.513503539487657, + "language_loss": 0.87005591, + "learning_rate": 3.278733262682299e-06, + "loss": 0.8921783, + "num_input_tokens_seen": 53710000, + "step": 2502, + "time_per_iteration": 2.756206750869751 + }, + { + "auxiliary_loss_clip": 0.01147668, + "auxiliary_loss_mlp": 0.0108785, + "balance_loss_clip": 1.03207707, + "balance_loss_mlp": 1.00640631, + "epoch": 0.3009679552696447, + "flos": 21506398254720.0, + "grad_norm": 2.511223813340876, + "language_loss": 0.83025146, + "learning_rate": 3.2781342128357484e-06, + "loss": 0.85260659, + "num_input_tokens_seen": 53729355, + "step": 2503, + "time_per_iteration": 2.7116141319274902 + }, + { + "auxiliary_loss_clip": 0.01122786, + "auxiliary_loss_mlp": 0.01088026, + "balance_loss_clip": 1.03285336, + "balance_loss_mlp": 1.00672519, + "epoch": 0.30108819816028376, + "flos": 21134004001920.0, + "grad_norm": 2.3316805847707136, + "language_loss": 0.80280906, + "learning_rate": 3.2775349690972547e-06, + "loss": 0.8249172, + "num_input_tokens_seen": 53743505, + "step": 2504, + "time_per_iteration": 3.6347463130950928 + }, + { + "auxiliary_loss_clip": 0.01124576, + "auxiliary_loss_mlp": 0.01081301, + "balance_loss_clip": 1.03185153, + "balance_loss_mlp": 1.00157404, + "epoch": 0.30120844105092287, + "flos": 71126434938240.0, + "grad_norm": 0.7504331259119711, + "language_loss": 0.51948315, + "learning_rate": 3.276935531557722e-06, + "loss": 0.54154193, + "num_input_tokens_seen": 53808725, + "step": 2505, + "time_per_iteration": 3.369593620300293 + }, + { + "auxiliary_loss_clip": 0.01110405, + "auxiliary_loss_mlp": 0.01087909, + "balance_loss_clip": 1.0298636, + "balance_loss_mlp": 1.00641775, + "epoch": 0.301328683941562, + "flos": 20264571302400.0, + "grad_norm": 2.102940710128932, + "language_loss": 0.79272425, + "learning_rate": 3.2763359003080837e-06, + "loss": 0.81470752, + "num_input_tokens_seen": 53825680, + "step": 2506, + "time_per_iteration": 2.8748116493225098 + }, + { + "auxiliary_loss_clip": 0.01118213, + "auxiliary_loss_mlp": 0.01080558, + "balance_loss_clip": 1.03421736, + "balance_loss_mlp": 1.00083053, + "epoch": 0.30144892683220104, + "flos": 70648212240000.0, + "grad_norm": 0.7969560947788639, + "language_loss": 0.62492716, + "learning_rate": 3.2757360754393047e-06, + "loss": 0.64691484, + "num_input_tokens_seen": 53889750, + "step": 2507, + "time_per_iteration": 3.3729794025421143 + }, + { + "auxiliary_loss_clip": 0.01136922, + "auxiliary_loss_mlp": 0.01087701, + "balance_loss_clip": 1.03109503, + "balance_loss_mlp": 1.00625694, + "epoch": 0.30156916972284015, + "flos": 22820549241600.0, + "grad_norm": 3.856503803359797, + "language_loss": 0.63945717, + "learning_rate": 3.2751360570423767e-06, + "loss": 0.66170335, + "num_input_tokens_seen": 53908135, + "step": 2508, + "time_per_iteration": 3.5998384952545166 + }, + { + "auxiliary_loss_clip": 0.01125437, + "auxiliary_loss_mlp": 0.01086952, + "balance_loss_clip": 1.02919829, + "balance_loss_mlp": 1.00541306, + "epoch": 0.3016894126134792, + "flos": 29899188529920.0, + "grad_norm": 2.1175387195733033, + "language_loss": 0.75882739, + "learning_rate": 3.2745358452083236e-06, + "loss": 0.78095126, + "num_input_tokens_seen": 53931035, + "step": 2509, + "time_per_iteration": 3.7659990787506104 + }, + { + "auxiliary_loss_clip": 0.01137392, + "auxiliary_loss_mlp": 0.01088572, + "balance_loss_clip": 1.03072453, + "balance_loss_mlp": 1.0074625, + "epoch": 0.3018096555041183, + "flos": 21546331200000.0, + "grad_norm": 1.4086489326899636, + "language_loss": 0.82091427, + "learning_rate": 3.2739354400281955e-06, + "loss": 0.84317386, + "num_input_tokens_seen": 53952255, + "step": 2510, + "time_per_iteration": 3.6860930919647217 + }, + { + "auxiliary_loss_clip": 0.01109048, + "auxiliary_loss_mlp": 0.00872995, + "balance_loss_clip": 1.0329845, + "balance_loss_mlp": 1.00006795, + "epoch": 0.3019298983947574, + "flos": 59136294597120.0, + "grad_norm": 0.8647929536966373, + "language_loss": 0.63698882, + "learning_rate": 3.2733348415930744e-06, + "loss": 0.65680921, + "num_input_tokens_seen": 54014125, + "step": 2511, + "time_per_iteration": 3.362661838531494 + }, + { + "auxiliary_loss_clip": 0.01110051, + "auxiliary_loss_mlp": 0.01087509, + "balance_loss_clip": 1.0247972, + "balance_loss_mlp": 1.00620866, + "epoch": 0.3020501412853965, + "flos": 34423070941440.0, + "grad_norm": 2.128172388529663, + "language_loss": 0.80756408, + "learning_rate": 3.27273404999407e-06, + "loss": 0.82953972, + "num_input_tokens_seen": 54036345, + "step": 2512, + "time_per_iteration": 2.8126351833343506 + }, + { + "auxiliary_loss_clip": 0.0111599, + "auxiliary_loss_mlp": 0.01079728, + "balance_loss_clip": 1.03173387, + "balance_loss_mlp": 1.00000095, + "epoch": 0.3021703841760356, + "flos": 71008288128000.0, + "grad_norm": 0.8044891830818083, + "language_loss": 0.60507143, + "learning_rate": 3.272133065322322e-06, + "loss": 0.62702858, + "num_input_tokens_seen": 54094615, + "step": 2513, + "time_per_iteration": 3.280686855316162 + }, + { + "auxiliary_loss_clip": 0.01146283, + "auxiliary_loss_mlp": 0.01088605, + "balance_loss_clip": 1.03117967, + "balance_loss_mlp": 1.0073998, + "epoch": 0.3022906270666747, + "flos": 21510528318720.0, + "grad_norm": 1.6109815563014167, + "language_loss": 0.79617929, + "learning_rate": 3.271531887669e-06, + "loss": 0.81852818, + "num_input_tokens_seen": 54114675, + "step": 2514, + "time_per_iteration": 2.6721556186676025 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.01088098, + "balance_loss_clip": 1.02826381, + "balance_loss_mlp": 1.00665438, + "epoch": 0.30241086995731375, + "flos": 31132001168640.0, + "grad_norm": 2.187054230800702, + "language_loss": 0.63400948, + "learning_rate": 3.2709305171253015e-06, + "loss": 0.65598881, + "num_input_tokens_seen": 54134795, + "step": 2515, + "time_per_iteration": 2.915491819381714 + }, + { + "auxiliary_loss_clip": 0.01137366, + "auxiliary_loss_mlp": 0.01088566, + "balance_loss_clip": 1.03153443, + "balance_loss_mlp": 1.00707436, + "epoch": 0.30253111284795287, + "flos": 23511542152320.0, + "grad_norm": 1.7119439470187978, + "language_loss": 0.7742933, + "learning_rate": 3.2703289537824536e-06, + "loss": 0.79655266, + "num_input_tokens_seen": 54154595, + "step": 2516, + "time_per_iteration": 2.8005199432373047 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01087787, + "balance_loss_clip": 1.02982748, + "balance_loss_mlp": 1.00639057, + "epoch": 0.302651355738592, + "flos": 18725367651840.0, + "grad_norm": 3.0159185037565566, + "language_loss": 0.78555536, + "learning_rate": 3.269727197731714e-06, + "loss": 0.80754429, + "num_input_tokens_seen": 54167360, + "step": 2517, + "time_per_iteration": 2.7343151569366455 + }, + { + "auxiliary_loss_clip": 0.01102253, + "auxiliary_loss_mlp": 0.01086868, + "balance_loss_clip": 1.0254395, + "balance_loss_mlp": 1.005615, + "epoch": 0.30277159862923103, + "flos": 22418888382720.0, + "grad_norm": 1.5914756252381332, + "language_loss": 0.77913505, + "learning_rate": 3.269125249064367e-06, + "loss": 0.80102623, + "num_input_tokens_seen": 54187055, + "step": 2518, + "time_per_iteration": 2.8262157440185547 + }, + { + "auxiliary_loss_clip": 0.01147808, + "auxiliary_loss_mlp": 0.01088214, + "balance_loss_clip": 1.03185463, + "balance_loss_mlp": 1.00681841, + "epoch": 0.30289184151987014, + "flos": 22273126992000.0, + "grad_norm": 1.8739958301813706, + "language_loss": 0.83340657, + "learning_rate": 3.2685231078717297e-06, + "loss": 0.85576677, + "num_input_tokens_seen": 54207245, + "step": 2519, + "time_per_iteration": 2.6330583095550537 + }, + { + "auxiliary_loss_clip": 0.01115357, + "auxiliary_loss_mlp": 0.00873836, + "balance_loss_clip": 1.02843475, + "balance_loss_mlp": 1.0002389, + "epoch": 0.30301208441050925, + "flos": 25225594231680.0, + "grad_norm": 2.105076694369315, + "language_loss": 0.75801301, + "learning_rate": 3.267920774245145e-06, + "loss": 0.77790493, + "num_input_tokens_seen": 54226650, + "step": 2520, + "time_per_iteration": 2.8471693992614746 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01085933, + "balance_loss_clip": 1.03286433, + "balance_loss_mlp": 1.00448883, + "epoch": 0.3031323273011483, + "flos": 23039245198080.0, + "grad_norm": 2.2752496888784393, + "language_loss": 0.8429563, + "learning_rate": 3.2673182482759876e-06, + "loss": 0.86520457, + "num_input_tokens_seen": 54245765, + "step": 2521, + "time_per_iteration": 2.736926794052124 + }, + { + "auxiliary_loss_clip": 0.01136543, + "auxiliary_loss_mlp": 0.01089213, + "balance_loss_clip": 1.03127873, + "balance_loss_mlp": 1.00776887, + "epoch": 0.3032525701917874, + "flos": 18876695650560.0, + "grad_norm": 6.092920306238431, + "language_loss": 0.65583277, + "learning_rate": 3.266715530055659e-06, + "loss": 0.67809033, + "num_input_tokens_seen": 54263915, + "step": 2522, + "time_per_iteration": 2.714721441268921 + }, + { + "auxiliary_loss_clip": 0.01138272, + "auxiliary_loss_mlp": 0.01087406, + "balance_loss_clip": 1.03081894, + "balance_loss_mlp": 1.00591421, + "epoch": 0.30337281308242653, + "flos": 17782641250560.0, + "grad_norm": 1.5811553524028839, + "language_loss": 0.80345935, + "learning_rate": 3.2661126196755927e-06, + "loss": 0.82571614, + "num_input_tokens_seen": 54283025, + "step": 2523, + "time_per_iteration": 2.756345510482788 + }, + { + "auxiliary_loss_clip": 0.01130785, + "auxiliary_loss_mlp": 0.01079406, + "balance_loss_clip": 1.03062081, + "balance_loss_mlp": 1.00006032, + "epoch": 0.3034930559730656, + "flos": 57824298426240.0, + "grad_norm": 0.7747286711913516, + "language_loss": 0.55984259, + "learning_rate": 3.265509517227248e-06, + "loss": 0.58194453, + "num_input_tokens_seen": 54339840, + "step": 2524, + "time_per_iteration": 3.242868185043335 + }, + { + "auxiliary_loss_clip": 0.01129288, + "auxiliary_loss_mlp": 0.01088908, + "balance_loss_clip": 1.03113413, + "balance_loss_mlp": 1.00760746, + "epoch": 0.3036132988637047, + "flos": 14755587419520.0, + "grad_norm": 3.2273570862122516, + "language_loss": 0.80955249, + "learning_rate": 3.264906222802115e-06, + "loss": 0.83173442, + "num_input_tokens_seen": 54357690, + "step": 2525, + "time_per_iteration": 2.754063367843628 + }, + { + "auxiliary_loss_clip": 0.01147859, + "auxiliary_loss_mlp": 0.01086253, + "balance_loss_clip": 1.03216422, + "balance_loss_mlp": 1.00485742, + "epoch": 0.30373354175434375, + "flos": 21033203460480.0, + "grad_norm": 2.0067689427479936, + "language_loss": 0.78072429, + "learning_rate": 3.264302736491715e-06, + "loss": 0.80306536, + "num_input_tokens_seen": 54377810, + "step": 2526, + "time_per_iteration": 2.6683778762817383 + }, + { + "auxiliary_loss_clip": 0.01133093, + "auxiliary_loss_mlp": 0.01088356, + "balance_loss_clip": 1.02977026, + "balance_loss_mlp": 1.00700772, + "epoch": 0.30385378464498286, + "flos": 21143233797120.0, + "grad_norm": 1.7441810720472055, + "language_loss": 0.871351, + "learning_rate": 3.263699058387594e-06, + "loss": 0.89356554, + "num_input_tokens_seen": 54395245, + "step": 2527, + "time_per_iteration": 2.729856252670288 + }, + { + "auxiliary_loss_clip": 0.01122717, + "auxiliary_loss_mlp": 0.01088229, + "balance_loss_clip": 1.03142154, + "balance_loss_mlp": 1.00692868, + "epoch": 0.30397402753562197, + "flos": 20629244131200.0, + "grad_norm": 2.2948894202923813, + "language_loss": 0.90582955, + "learning_rate": 3.2630951885813315e-06, + "loss": 0.92793894, + "num_input_tokens_seen": 54412640, + "step": 2528, + "time_per_iteration": 2.727071523666382 + }, + { + "auxiliary_loss_clip": 0.0112849, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.02970028, + "balance_loss_mlp": 1.00582862, + "epoch": 0.304094270426261, + "flos": 15085678429440.0, + "grad_norm": 1.8519334323891152, + "language_loss": 0.78328663, + "learning_rate": 3.262491127164533e-06, + "loss": 0.80544281, + "num_input_tokens_seen": 54431455, + "step": 2529, + "time_per_iteration": 3.6692020893096924 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.00873748, + "balance_loss_clip": 1.03123784, + "balance_loss_mlp": 1.00033402, + "epoch": 0.30421451331690014, + "flos": 13845216193920.0, + "grad_norm": 3.776812726200105, + "language_loss": 0.80434424, + "learning_rate": 3.2618868742288337e-06, + "loss": 0.82424808, + "num_input_tokens_seen": 54448380, + "step": 2530, + "time_per_iteration": 2.7278146743774414 + }, + { + "auxiliary_loss_clip": 0.01139237, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_clip": 1.03297901, + "balance_loss_mlp": 1.00682211, + "epoch": 0.30433475620753925, + "flos": 17384212615680.0, + "grad_norm": 1.6766451839091405, + "language_loss": 0.72299087, + "learning_rate": 3.261282429865899e-06, + "loss": 0.74526584, + "num_input_tokens_seen": 54466385, + "step": 2531, + "time_per_iteration": 2.744130849838257 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.00873659, + "balance_loss_clip": 1.03419256, + "balance_loss_mlp": 1.00019979, + "epoch": 0.3044549990981783, + "flos": 18916951818240.0, + "grad_norm": 1.6696211848169487, + "language_loss": 0.7230159, + "learning_rate": 3.2606777941674225e-06, + "loss": 0.74308068, + "num_input_tokens_seen": 54485040, + "step": 2532, + "time_per_iteration": 2.711763381958008 + }, + { + "auxiliary_loss_clip": 0.0109592, + "auxiliary_loss_mlp": 0.01087206, + "balance_loss_clip": 1.02446318, + "balance_loss_mlp": 1.00585747, + "epoch": 0.3045752419888174, + "flos": 21068431724160.0, + "grad_norm": 1.9949045096915454, + "language_loss": 0.84541667, + "learning_rate": 3.2600729672251276e-06, + "loss": 0.86724794, + "num_input_tokens_seen": 54502755, + "step": 2533, + "time_per_iteration": 3.7075982093811035 + }, + { + "auxiliary_loss_clip": 0.0114868, + "auxiliary_loss_mlp": 0.00873759, + "balance_loss_clip": 1.03370571, + "balance_loss_mlp": 1.00029969, + "epoch": 0.3046954848794565, + "flos": 29096405516160.0, + "grad_norm": 1.822158018658192, + "language_loss": 0.65150583, + "learning_rate": 3.259467949130765e-06, + "loss": 0.67173022, + "num_input_tokens_seen": 54524165, + "step": 2534, + "time_per_iteration": 3.682684898376465 + }, + { + "auxiliary_loss_clip": 0.01127534, + "auxiliary_loss_mlp": 0.01088499, + "balance_loss_clip": 1.03071654, + "balance_loss_mlp": 1.00710332, + "epoch": 0.3048157277700956, + "flos": 20295346279680.0, + "grad_norm": 2.0344949641723256, + "language_loss": 0.82844841, + "learning_rate": 3.2588627399761164e-06, + "loss": 0.85060871, + "num_input_tokens_seen": 54540160, + "step": 2535, + "time_per_iteration": 2.7274856567382812 + }, + { + "auxiliary_loss_clip": 0.01128747, + "auxiliary_loss_mlp": 0.01086321, + "balance_loss_clip": 1.03226089, + "balance_loss_mlp": 1.00516379, + "epoch": 0.3049359706607347, + "flos": 22739929165440.0, + "grad_norm": 1.579418232162739, + "language_loss": 0.70772111, + "learning_rate": 3.2582573398529903e-06, + "loss": 0.72987175, + "num_input_tokens_seen": 54557515, + "step": 2536, + "time_per_iteration": 3.630661725997925 + }, + { + "auxiliary_loss_clip": 0.01124477, + "auxiliary_loss_mlp": 0.01088096, + "balance_loss_clip": 1.03270614, + "balance_loss_mlp": 1.00665236, + "epoch": 0.3050562135513738, + "flos": 18434634969600.0, + "grad_norm": 4.862423412844595, + "language_loss": 0.73480469, + "learning_rate": 3.2576517488532265e-06, + "loss": 0.75693041, + "num_input_tokens_seen": 54573865, + "step": 2537, + "time_per_iteration": 2.7815630435943604 + }, + { + "auxiliary_loss_clip": 0.01140374, + "auxiliary_loss_mlp": 0.01088235, + "balance_loss_clip": 1.03331816, + "balance_loss_mlp": 1.00707746, + "epoch": 0.30517645644201286, + "flos": 20370327920640.0, + "grad_norm": 1.7893152189543253, + "language_loss": 0.87328255, + "learning_rate": 3.257045967068692e-06, + "loss": 0.89556867, + "num_input_tokens_seen": 54593120, + "step": 2538, + "time_per_iteration": 2.795081615447998 + }, + { + "auxiliary_loss_clip": 0.01149715, + "auxiliary_loss_mlp": 0.01089226, + "balance_loss_clip": 1.03395915, + "balance_loss_mlp": 1.00778258, + "epoch": 0.30529669933265197, + "flos": 21945118970880.0, + "grad_norm": 1.574640386022061, + "language_loss": 0.82017487, + "learning_rate": 3.2564399945912848e-06, + "loss": 0.84256423, + "num_input_tokens_seen": 54612910, + "step": 2539, + "time_per_iteration": 2.6259219646453857 + }, + { + "auxiliary_loss_clip": 0.01102707, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_clip": 1.03347087, + "balance_loss_mlp": 1.00776947, + "epoch": 0.305416942223291, + "flos": 21835411856640.0, + "grad_norm": 2.05721747883043, + "language_loss": 0.81969267, + "learning_rate": 3.2558338315129287e-06, + "loss": 0.84160805, + "num_input_tokens_seen": 54631055, + "step": 2540, + "time_per_iteration": 2.9417812824249268 + }, + { + "auxiliary_loss_clip": 0.01142169, + "auxiliary_loss_mlp": 0.01086749, + "balance_loss_clip": 1.03494906, + "balance_loss_mlp": 1.00549555, + "epoch": 0.30553718511393013, + "flos": 33911810709120.0, + "grad_norm": 1.7348268762158463, + "language_loss": 0.76237369, + "learning_rate": 3.2552274779255785e-06, + "loss": 0.78466284, + "num_input_tokens_seen": 54651985, + "step": 2541, + "time_per_iteration": 2.7672340869903564 + }, + { + "auxiliary_loss_clip": 0.01136179, + "auxiliary_loss_mlp": 0.01086408, + "balance_loss_clip": 1.03039217, + "balance_loss_mlp": 1.00505948, + "epoch": 0.30565742800456924, + "flos": 22268530051200.0, + "grad_norm": 2.2570621236202153, + "language_loss": 0.76516581, + "learning_rate": 3.2546209339212184e-06, + "loss": 0.78739166, + "num_input_tokens_seen": 54671005, + "step": 2542, + "time_per_iteration": 2.6919796466827393 + }, + { + "auxiliary_loss_clip": 0.01131645, + "auxiliary_loss_mlp": 0.01087162, + "balance_loss_clip": 1.0328145, + "balance_loss_mlp": 1.00590909, + "epoch": 0.3057776708952083, + "flos": 22565044823040.0, + "grad_norm": 1.5166424838126547, + "language_loss": 0.77614015, + "learning_rate": 3.25401419959186e-06, + "loss": 0.79832816, + "num_input_tokens_seen": 54691615, + "step": 2543, + "time_per_iteration": 2.700660467147827 + }, + { + "auxiliary_loss_clip": 0.01134888, + "auxiliary_loss_mlp": 0.01089561, + "balance_loss_clip": 1.03537619, + "balance_loss_mlp": 1.00826049, + "epoch": 0.3058979137858474, + "flos": 21799213925760.0, + "grad_norm": 1.7182822114161727, + "language_loss": 0.76239157, + "learning_rate": 3.253407275029545e-06, + "loss": 0.78463608, + "num_input_tokens_seen": 54710520, + "step": 2544, + "time_per_iteration": 2.7667922973632812 + }, + { + "auxiliary_loss_clip": 0.01118093, + "auxiliary_loss_mlp": 0.0108896, + "balance_loss_clip": 1.02981997, + "balance_loss_mlp": 1.00737286, + "epoch": 0.3060181566764865, + "flos": 26979435601920.0, + "grad_norm": 2.3085806470863184, + "language_loss": 0.80195081, + "learning_rate": 3.2528001603263425e-06, + "loss": 0.8240214, + "num_input_tokens_seen": 54732590, + "step": 2545, + "time_per_iteration": 2.815713882446289 + }, + { + "auxiliary_loss_clip": 0.01134662, + "auxiliary_loss_mlp": 0.01088959, + "balance_loss_clip": 1.03053164, + "balance_loss_mlp": 1.00751543, + "epoch": 0.3061383995671256, + "flos": 19865101173120.0, + "grad_norm": 1.7707560261844388, + "language_loss": 0.81377226, + "learning_rate": 3.2521928555743514e-06, + "loss": 0.83600849, + "num_input_tokens_seen": 54749935, + "step": 2546, + "time_per_iteration": 2.658318281173706 + }, + { + "auxiliary_loss_clip": 0.0112995, + "auxiliary_loss_mlp": 0.00873846, + "balance_loss_clip": 1.03174329, + "balance_loss_mlp": 1.00039005, + "epoch": 0.3062586424577647, + "flos": 22127509255680.0, + "grad_norm": 1.790918456154221, + "language_loss": 0.67674506, + "learning_rate": 3.2515853608657e-06, + "loss": 0.69678301, + "num_input_tokens_seen": 54767935, + "step": 2547, + "time_per_iteration": 2.7596383094787598 + }, + { + "auxiliary_loss_clip": 0.01141269, + "auxiliary_loss_mlp": 0.01088734, + "balance_loss_clip": 1.03381205, + "balance_loss_mlp": 1.00733817, + "epoch": 0.3063788853484038, + "flos": 20845497962880.0, + "grad_norm": 2.1730605841719877, + "language_loss": 0.74421108, + "learning_rate": 3.250977676292545e-06, + "loss": 0.76651108, + "num_input_tokens_seen": 54786175, + "step": 2548, + "time_per_iteration": 2.6679511070251465 + }, + { + "auxiliary_loss_clip": 0.01128354, + "auxiliary_loss_mlp": 0.01088087, + "balance_loss_clip": 1.03106678, + "balance_loss_mlp": 1.00673854, + "epoch": 0.30649912823904285, + "flos": 16209717707520.0, + "grad_norm": 2.447615120658927, + "language_loss": 0.79212946, + "learning_rate": 3.2503698019470712e-06, + "loss": 0.81429386, + "num_input_tokens_seen": 54801945, + "step": 2549, + "time_per_iteration": 2.7765257358551025 + }, + { + "auxiliary_loss_clip": 0.01138734, + "auxiliary_loss_mlp": 0.01087524, + "balance_loss_clip": 1.0319699, + "balance_loss_mlp": 1.00603223, + "epoch": 0.30661937112968196, + "flos": 18617815353600.0, + "grad_norm": 2.2772969731199764, + "language_loss": 0.7800076, + "learning_rate": 3.249761737921492e-06, + "loss": 0.80227017, + "num_input_tokens_seen": 54818475, + "step": 2550, + "time_per_iteration": 2.739349842071533 + }, + { + "auxiliary_loss_clip": 0.0112862, + "auxiliary_loss_mlp": 0.01088046, + "balance_loss_clip": 1.03194356, + "balance_loss_mlp": 1.00679302, + "epoch": 0.30673961402032107, + "flos": 31390809638400.0, + "grad_norm": 2.0978631869786324, + "language_loss": 0.74521291, + "learning_rate": 3.249153484308051e-06, + "loss": 0.76737958, + "num_input_tokens_seen": 54837090, + "step": 2551, + "time_per_iteration": 2.825279474258423 + }, + { + "auxiliary_loss_clip": 0.01106806, + "auxiliary_loss_mlp": 0.01086268, + "balance_loss_clip": 1.02868342, + "balance_loss_mlp": 1.00477624, + "epoch": 0.3068598569109601, + "flos": 20229809915520.0, + "grad_norm": 1.740178746997848, + "language_loss": 0.7772128, + "learning_rate": 3.2485450411990194e-06, + "loss": 0.79914349, + "num_input_tokens_seen": 54856445, + "step": 2552, + "time_per_iteration": 2.781062126159668 + }, + { + "auxiliary_loss_clip": 0.01148232, + "auxiliary_loss_mlp": 0.0108729, + "balance_loss_clip": 1.03277338, + "balance_loss_mlp": 1.00570357, + "epoch": 0.30698009980159924, + "flos": 29601991399680.0, + "grad_norm": 1.9027884380898996, + "language_loss": 0.82365203, + "learning_rate": 3.2479364086866983e-06, + "loss": 0.84600723, + "num_input_tokens_seen": 54876700, + "step": 2553, + "time_per_iteration": 2.6848654747009277 + }, + { + "auxiliary_loss_clip": 0.01126056, + "auxiliary_loss_mlp": 0.00873897, + "balance_loss_clip": 1.03012455, + "balance_loss_mlp": 1.00043726, + "epoch": 0.30710034269223835, + "flos": 23842423261440.0, + "grad_norm": 1.8677725755849186, + "language_loss": 0.81618714, + "learning_rate": 3.247327586863416e-06, + "loss": 0.83618665, + "num_input_tokens_seen": 54897580, + "step": 2554, + "time_per_iteration": 3.7155017852783203 + }, + { + "auxiliary_loss_clip": 0.01119216, + "auxiliary_loss_mlp": 0.01087567, + "balance_loss_clip": 1.02970648, + "balance_loss_mlp": 1.00617123, + "epoch": 0.3072205855828774, + "flos": 25884986152320.0, + "grad_norm": 1.8854954574432974, + "language_loss": 0.76867568, + "learning_rate": 3.2467185758215304e-06, + "loss": 0.79074347, + "num_input_tokens_seen": 54917320, + "step": 2555, + "time_per_iteration": 2.807769298553467 + }, + { + "auxiliary_loss_clip": 0.01102441, + "auxiliary_loss_mlp": 0.00873755, + "balance_loss_clip": 1.02822065, + "balance_loss_mlp": 1.00046492, + "epoch": 0.3073408284735165, + "flos": 22236390357120.0, + "grad_norm": 2.983720705358667, + "language_loss": 0.85647458, + "learning_rate": 3.246109375653428e-06, + "loss": 0.87623656, + "num_input_tokens_seen": 54934085, + "step": 2556, + "time_per_iteration": 2.787405252456665 + }, + { + "auxiliary_loss_clip": 0.01149499, + "auxiliary_loss_mlp": 0.01087215, + "balance_loss_clip": 1.03409147, + "balance_loss_mlp": 1.00591445, + "epoch": 0.30746107136415557, + "flos": 19500284689920.0, + "grad_norm": 1.7032333744817676, + "language_loss": 0.78184062, + "learning_rate": 3.2454999864515243e-06, + "loss": 0.80420774, + "num_input_tokens_seen": 54953460, + "step": 2557, + "time_per_iteration": 2.636061906814575 + }, + { + "auxiliary_loss_clip": 0.01117415, + "auxiliary_loss_mlp": 0.0087385, + "balance_loss_clip": 1.02708423, + "balance_loss_mlp": 1.00046587, + "epoch": 0.3075813142547947, + "flos": 21724806902400.0, + "grad_norm": 1.7748503251081724, + "language_loss": 0.69572586, + "learning_rate": 3.244890408308263e-06, + "loss": 0.71563852, + "num_input_tokens_seen": 54974165, + "step": 2558, + "time_per_iteration": 3.707824230194092 + }, + { + "auxiliary_loss_clip": 0.01108073, + "auxiliary_loss_mlp": 0.01086956, + "balance_loss_clip": 1.02821994, + "balance_loss_mlp": 1.00565505, + "epoch": 0.3077015571454338, + "flos": 24097963593600.0, + "grad_norm": 1.841357138337897, + "language_loss": 0.60971713, + "learning_rate": 3.2442806413161165e-06, + "loss": 0.63166738, + "num_input_tokens_seen": 54993810, + "step": 2559, + "time_per_iteration": 2.936692953109741 + }, + { + "auxiliary_loss_clip": 0.01090368, + "auxiliary_loss_mlp": 0.01088716, + "balance_loss_clip": 1.02854633, + "balance_loss_mlp": 1.00732017, + "epoch": 0.30782180003607285, + "flos": 18405476104320.0, + "grad_norm": 1.8677288712851576, + "language_loss": 0.75858605, + "learning_rate": 3.243670685567586e-06, + "loss": 0.78037691, + "num_input_tokens_seen": 55011210, + "step": 2560, + "time_per_iteration": 3.7746310234069824 + }, + { + "auxiliary_loss_clip": 0.01129591, + "auxiliary_loss_mlp": 0.00873703, + "balance_loss_clip": 1.03244066, + "balance_loss_mlp": 1.00050521, + "epoch": 0.30794204292671196, + "flos": 23878549365120.0, + "grad_norm": 1.983993845107894, + "language_loss": 0.80014592, + "learning_rate": 3.2430605411552012e-06, + "loss": 0.82017887, + "num_input_tokens_seen": 55031325, + "step": 2561, + "time_per_iteration": 2.778526544570923 + }, + { + "auxiliary_loss_clip": 0.01110185, + "auxiliary_loss_mlp": 0.01079782, + "balance_loss_clip": 1.03414989, + "balance_loss_mlp": 1.0000546, + "epoch": 0.30806228581735107, + "flos": 67927800816000.0, + "grad_norm": 0.8944885220792478, + "language_loss": 0.70598042, + "learning_rate": 3.2424502081715205e-06, + "loss": 0.72788012, + "num_input_tokens_seen": 55094440, + "step": 2562, + "time_per_iteration": 4.286638498306274 + }, + { + "auxiliary_loss_clip": 0.01117228, + "auxiliary_loss_mlp": 0.01088284, + "balance_loss_clip": 1.03263283, + "balance_loss_mlp": 1.00693572, + "epoch": 0.3081825287079901, + "flos": 23843213360640.0, + "grad_norm": 1.8094898564067745, + "language_loss": 0.7758323, + "learning_rate": 3.241839686709132e-06, + "loss": 0.79788733, + "num_input_tokens_seen": 55115375, + "step": 2563, + "time_per_iteration": 2.744502305984497 + }, + { + "auxiliary_loss_clip": 0.01140669, + "auxiliary_loss_mlp": 0.01089818, + "balance_loss_clip": 1.03338003, + "balance_loss_mlp": 1.00827861, + "epoch": 0.30830277159862923, + "flos": 16209969102720.0, + "grad_norm": 3.6638260807879686, + "language_loss": 0.82177126, + "learning_rate": 3.2412289768606495e-06, + "loss": 0.84407616, + "num_input_tokens_seen": 55131945, + "step": 2564, + "time_per_iteration": 2.827322244644165 + }, + { + "auxiliary_loss_clip": 0.01141561, + "auxiliary_loss_mlp": 0.01089414, + "balance_loss_clip": 1.03476012, + "balance_loss_mlp": 1.00801754, + "epoch": 0.30842301448926834, + "flos": 29349503723520.0, + "grad_norm": 1.8005647898491808, + "language_loss": 0.82469296, + "learning_rate": 3.240618078718718e-06, + "loss": 0.84700274, + "num_input_tokens_seen": 55153405, + "step": 2565, + "time_per_iteration": 2.7532103061676025 + }, + { + "auxiliary_loss_clip": 0.01117499, + "auxiliary_loss_mlp": 0.01086698, + "balance_loss_clip": 1.02893257, + "balance_loss_mlp": 1.00525367, + "epoch": 0.3085432573799074, + "flos": 21945190798080.0, + "grad_norm": 1.925046433242233, + "language_loss": 0.74273503, + "learning_rate": 3.240006992376011e-06, + "loss": 0.76477695, + "num_input_tokens_seen": 55173030, + "step": 2566, + "time_per_iteration": 2.788938045501709 + }, + { + "auxiliary_loss_clip": 0.01132036, + "auxiliary_loss_mlp": 0.01088331, + "balance_loss_clip": 1.0335747, + "balance_loss_mlp": 1.00703001, + "epoch": 0.3086635002705465, + "flos": 22054718344320.0, + "grad_norm": 2.1404979659786654, + "language_loss": 0.76211166, + "learning_rate": 3.2393957179252284e-06, + "loss": 0.78431535, + "num_input_tokens_seen": 55189565, + "step": 2567, + "time_per_iteration": 2.712357759475708 + }, + { + "auxiliary_loss_clip": 0.01150365, + "auxiliary_loss_mlp": 0.01087862, + "balance_loss_clip": 1.03509152, + "balance_loss_mlp": 1.00646639, + "epoch": 0.3087837431611856, + "flos": 32665925520000.0, + "grad_norm": 1.758649120405134, + "language_loss": 0.80780363, + "learning_rate": 3.2387842554591016e-06, + "loss": 0.83018589, + "num_input_tokens_seen": 55210380, + "step": 2568, + "time_per_iteration": 2.7352306842803955 + }, + { + "auxiliary_loss_clip": 0.01149618, + "auxiliary_loss_mlp": 0.01089409, + "balance_loss_clip": 1.03470397, + "balance_loss_mlp": 1.00786948, + "epoch": 0.3089039860518247, + "flos": 17599245384960.0, + "grad_norm": 2.1363853935805763, + "language_loss": 0.87820733, + "learning_rate": 3.238172605070388e-06, + "loss": 0.90059757, + "num_input_tokens_seen": 55225795, + "step": 2569, + "time_per_iteration": 2.689682722091675 + }, + { + "auxiliary_loss_clip": 0.01131071, + "auxiliary_loss_mlp": 0.0087394, + "balance_loss_clip": 1.03048325, + "balance_loss_mlp": 1.0005219, + "epoch": 0.3090242289424638, + "flos": 14383839611520.0, + "grad_norm": 2.0953687520568924, + "language_loss": 0.781138, + "learning_rate": 3.2375607668518745e-06, + "loss": 0.80118817, + "num_input_tokens_seen": 55238830, + "step": 2570, + "time_per_iteration": 2.6853785514831543 + }, + { + "auxiliary_loss_clip": 0.01131163, + "auxiliary_loss_mlp": 0.01087003, + "balance_loss_clip": 1.0338726, + "balance_loss_mlp": 1.00560737, + "epoch": 0.30914447183310284, + "flos": 16068625084800.0, + "grad_norm": 2.1491416588852394, + "language_loss": 0.90146565, + "learning_rate": 3.236948740896377e-06, + "loss": 0.9236474, + "num_input_tokens_seen": 55253630, + "step": 2571, + "time_per_iteration": 2.7247731685638428 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01087563, + "balance_loss_clip": 1.0333662, + "balance_loss_mlp": 1.00611973, + "epoch": 0.30926471472374195, + "flos": 32230221546240.0, + "grad_norm": 1.4362974706163023, + "language_loss": 0.84001195, + "learning_rate": 3.2363365272967384e-06, + "loss": 0.86228305, + "num_input_tokens_seen": 55276200, + "step": 2572, + "time_per_iteration": 2.7243893146514893 + }, + { + "auxiliary_loss_clip": 0.01132794, + "auxiliary_loss_mlp": 0.01087336, + "balance_loss_clip": 1.02922297, + "balance_loss_mlp": 1.00593972, + "epoch": 0.30938495761438106, + "flos": 20370722970240.0, + "grad_norm": 1.9251137869755361, + "language_loss": 0.81629658, + "learning_rate": 3.235724126145832e-06, + "loss": 0.83849788, + "num_input_tokens_seen": 55292235, + "step": 2573, + "time_per_iteration": 2.687448263168335 + }, + { + "auxiliary_loss_clip": 0.01139531, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_clip": 1.0325284, + "balance_loss_mlp": 1.00811088, + "epoch": 0.3095052005050201, + "flos": 24061155131520.0, + "grad_norm": 1.485102667002129, + "language_loss": 0.77514756, + "learning_rate": 3.235111537536558e-06, + "loss": 0.7974394, + "num_input_tokens_seen": 55313050, + "step": 2574, + "time_per_iteration": 2.7069952487945557 + }, + { + "auxiliary_loss_clip": 0.01139092, + "auxiliary_loss_mlp": 0.01087813, + "balance_loss_clip": 1.03283858, + "balance_loss_mlp": 1.00651264, + "epoch": 0.30962544339565923, + "flos": 23401547729280.0, + "grad_norm": 1.975461560799717, + "language_loss": 0.82678485, + "learning_rate": 3.2344987615618456e-06, + "loss": 0.84905392, + "num_input_tokens_seen": 55332885, + "step": 2575, + "time_per_iteration": 2.7075719833374023 + }, + { + "auxiliary_loss_clip": 0.01113916, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_clip": 1.02777267, + "balance_loss_mlp": 1.00652289, + "epoch": 0.30974568628629834, + "flos": 33799984692480.0, + "grad_norm": 1.758453881149616, + "language_loss": 0.78499103, + "learning_rate": 3.2338857983146533e-06, + "loss": 0.80700898, + "num_input_tokens_seen": 55354385, + "step": 2576, + "time_per_iteration": 2.8866987228393555 + }, + { + "auxiliary_loss_clip": 0.01132273, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_clip": 1.03485489, + "balance_loss_mlp": 1.00601888, + "epoch": 0.3098659291769374, + "flos": 20229594433920.0, + "grad_norm": 2.0537728549922614, + "language_loss": 0.76237214, + "learning_rate": 3.233272647887966e-06, + "loss": 0.78456998, + "num_input_tokens_seen": 55373275, + "step": 2577, + "time_per_iteration": 2.7526681423187256 + }, + { + "auxiliary_loss_clip": 0.01150807, + "auxiliary_loss_mlp": 0.01090618, + "balance_loss_clip": 1.03552163, + "balance_loss_mlp": 1.00922167, + "epoch": 0.3099861720675765, + "flos": 24748556682240.0, + "grad_norm": 1.7094004824408453, + "language_loss": 0.90090519, + "learning_rate": 3.2326593103747985e-06, + "loss": 0.92331946, + "num_input_tokens_seen": 55392290, + "step": 2578, + "time_per_iteration": 2.6956450939178467 + }, + { + "auxiliary_loss_clip": 0.01140621, + "auxiliary_loss_mlp": 0.01086616, + "balance_loss_clip": 1.03509033, + "balance_loss_mlp": 1.00526798, + "epoch": 0.3101064149582156, + "flos": 11765485704960.0, + "grad_norm": 2.003754345592718, + "language_loss": 0.85031211, + "learning_rate": 3.2320457858681936e-06, + "loss": 0.87258458, + "num_input_tokens_seen": 55410680, + "step": 2579, + "time_per_iteration": 3.6352474689483643 + }, + { + "auxiliary_loss_clip": 0.01131578, + "auxiliary_loss_mlp": 0.01089364, + "balance_loss_clip": 1.03362286, + "balance_loss_mlp": 1.00806284, + "epoch": 0.31022665784885467, + "flos": 23033247626880.0, + "grad_norm": 3.610958130839178, + "language_loss": 0.85350388, + "learning_rate": 3.2314320744612228e-06, + "loss": 0.87571329, + "num_input_tokens_seen": 55425980, + "step": 2580, + "time_per_iteration": 2.7706875801086426 + }, + { + "auxiliary_loss_clip": 0.01137488, + "auxiliary_loss_mlp": 0.01087379, + "balance_loss_clip": 1.03191996, + "balance_loss_mlp": 1.00612545, + "epoch": 0.3103469007394938, + "flos": 16289188548480.0, + "grad_norm": 1.9488252383323617, + "language_loss": 0.7631321, + "learning_rate": 3.2308181762469854e-06, + "loss": 0.78538084, + "num_input_tokens_seen": 55443925, + "step": 2581, + "time_per_iteration": 2.6420369148254395 + }, + { + "auxiliary_loss_clip": 0.01151042, + "auxiliary_loss_mlp": 0.01090666, + "balance_loss_clip": 1.0353843, + "balance_loss_mlp": 1.00898361, + "epoch": 0.3104671436301329, + "flos": 30515271626880.0, + "grad_norm": 5.958778654115969, + "language_loss": 0.7801075, + "learning_rate": 3.230204091318609e-06, + "loss": 0.80252457, + "num_input_tokens_seen": 55464465, + "step": 2582, + "time_per_iteration": 2.69401478767395 + }, + { + "auxiliary_loss_clip": 0.01149514, + "auxiliary_loss_mlp": 0.00873723, + "balance_loss_clip": 1.0342778, + "balance_loss_mlp": 1.00039268, + "epoch": 0.31058738652077195, + "flos": 20047240062720.0, + "grad_norm": 2.2750264877255626, + "language_loss": 0.85031044, + "learning_rate": 3.2295898197692503e-06, + "loss": 0.87054282, + "num_input_tokens_seen": 55483425, + "step": 2583, + "time_per_iteration": 2.6409807205200195 + }, + { + "auxiliary_loss_clip": 0.01150273, + "auxiliary_loss_mlp": 0.01087688, + "balance_loss_clip": 1.03515959, + "balance_loss_mlp": 1.00629234, + "epoch": 0.31070762941141106, + "flos": 28074639237120.0, + "grad_norm": 1.957094587512071, + "language_loss": 0.79050648, + "learning_rate": 3.228975361692094e-06, + "loss": 0.81288612, + "num_input_tokens_seen": 55504445, + "step": 2584, + "time_per_iteration": 2.7082362174987793 + }, + { + "auxiliary_loss_clip": 0.01141039, + "auxiliary_loss_mlp": 0.00873845, + "balance_loss_clip": 1.03357244, + "balance_loss_mlp": 1.00046551, + "epoch": 0.31082787230205017, + "flos": 20521907314560.0, + "grad_norm": 2.519956318131037, + "language_loss": 0.79915053, + "learning_rate": 3.228360717180352e-06, + "loss": 0.81929934, + "num_input_tokens_seen": 55521970, + "step": 2585, + "time_per_iteration": 4.468031644821167 + }, + { + "auxiliary_loss_clip": 0.01140777, + "auxiliary_loss_mlp": 0.00872887, + "balance_loss_clip": 1.04094458, + "balance_loss_mlp": 1.00030792, + "epoch": 0.3109481151926892, + "flos": 62445928723200.0, + "grad_norm": 0.8433227317123361, + "language_loss": 0.59426236, + "learning_rate": 3.227745886327266e-06, + "loss": 0.61439902, + "num_input_tokens_seen": 55580665, + "step": 2586, + "time_per_iteration": 3.18959379196167 + }, + { + "auxiliary_loss_clip": 0.01140635, + "auxiliary_loss_mlp": 0.01080653, + "balance_loss_clip": 1.04079628, + "balance_loss_mlp": 1.00092626, + "epoch": 0.31106835808332833, + "flos": 44746744723200.0, + "grad_norm": 0.8171825644372586, + "language_loss": 0.55856568, + "learning_rate": 3.227130869226105e-06, + "loss": 0.58077854, + "num_input_tokens_seen": 55637825, + "step": 2587, + "time_per_iteration": 4.099902629852295 + }, + { + "auxiliary_loss_clip": 0.01139175, + "auxiliary_loss_mlp": 0.01086938, + "balance_loss_clip": 1.03246856, + "balance_loss_mlp": 1.00554204, + "epoch": 0.3111886009739674, + "flos": 23403056100480.0, + "grad_norm": 2.345848712431469, + "language_loss": 0.82852232, + "learning_rate": 3.226515665970167e-06, + "loss": 0.85078335, + "num_input_tokens_seen": 55655365, + "step": 2588, + "time_per_iteration": 2.6796765327453613 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01087241, + "balance_loss_clip": 1.0321672, + "balance_loss_mlp": 1.00565445, + "epoch": 0.3113088438646065, + "flos": 17530728192000.0, + "grad_norm": 2.2064351983491934, + "language_loss": 0.8653599, + "learning_rate": 3.225900276652777e-06, + "loss": 0.88760591, + "num_input_tokens_seen": 55672140, + "step": 2589, + "time_per_iteration": 2.6721034049987793 + }, + { + "auxiliary_loss_clip": 0.01131222, + "auxiliary_loss_mlp": 0.01088032, + "balance_loss_clip": 1.03294826, + "balance_loss_mlp": 1.00677884, + "epoch": 0.3114290867552456, + "flos": 28365802882560.0, + "grad_norm": 1.648737092658327, + "language_loss": 0.75609559, + "learning_rate": 3.2252847013672906e-06, + "loss": 0.77828813, + "num_input_tokens_seen": 55694800, + "step": 2590, + "time_per_iteration": 2.777385711669922 + }, + { + "auxiliary_loss_clip": 0.01111722, + "auxiliary_loss_mlp": 0.01087982, + "balance_loss_clip": 1.02838588, + "balance_loss_mlp": 1.00663328, + "epoch": 0.31154932964588467, + "flos": 27379157126400.0, + "grad_norm": 1.9580343375336988, + "language_loss": 0.75723451, + "learning_rate": 3.224668940207089e-06, + "loss": 0.77923155, + "num_input_tokens_seen": 55713785, + "step": 2591, + "time_per_iteration": 2.818626642227173 + }, + { + "auxiliary_loss_clip": 0.01112736, + "auxiliary_loss_mlp": 0.01089013, + "balance_loss_clip": 1.0311085, + "balance_loss_mlp": 1.007617, + "epoch": 0.3116695725365238, + "flos": 26541864120960.0, + "grad_norm": 2.0757956192231, + "language_loss": 0.86944884, + "learning_rate": 3.2240529932655828e-06, + "loss": 0.89146638, + "num_input_tokens_seen": 55733050, + "step": 2592, + "time_per_iteration": 2.894751787185669 + }, + { + "auxiliary_loss_clip": 0.01128044, + "auxiliary_loss_mlp": 0.01087351, + "balance_loss_clip": 1.03185034, + "balance_loss_mlp": 1.00595474, + "epoch": 0.3117898154271629, + "flos": 21177600134400.0, + "grad_norm": 3.023766806948941, + "language_loss": 0.88382661, + "learning_rate": 3.223436860636211e-06, + "loss": 0.90598053, + "num_input_tokens_seen": 55748685, + "step": 2593, + "time_per_iteration": 2.7946836948394775 + }, + { + "auxiliary_loss_clip": 0.01149205, + "auxiliary_loss_mlp": 0.01087959, + "balance_loss_clip": 1.03428948, + "balance_loss_mlp": 1.00656295, + "epoch": 0.31191005831780194, + "flos": 27272430840960.0, + "grad_norm": 1.5699692348698187, + "language_loss": 0.74130058, + "learning_rate": 3.2228205424124403e-06, + "loss": 0.76367223, + "num_input_tokens_seen": 55771840, + "step": 2594, + "time_per_iteration": 2.7028307914733887 + }, + { + "auxiliary_loss_clip": 0.01127209, + "auxiliary_loss_mlp": 0.01087376, + "balance_loss_clip": 1.03090405, + "balance_loss_mlp": 1.0059799, + "epoch": 0.31203030120844105, + "flos": 12963501043200.0, + "grad_norm": 2.464043096451702, + "language_loss": 0.74832535, + "learning_rate": 3.222204038687765e-06, + "loss": 0.77047122, + "num_input_tokens_seen": 55784975, + "step": 2595, + "time_per_iteration": 2.7415318489074707 + }, + { + "auxiliary_loss_clip": 0.0113973, + "auxiliary_loss_mlp": 0.01087854, + "balance_loss_clip": 1.03431892, + "balance_loss_mlp": 1.00664842, + "epoch": 0.31215054409908016, + "flos": 27562014288000.0, + "grad_norm": 1.5800993856140246, + "language_loss": 0.88431501, + "learning_rate": 3.221587349555709e-06, + "loss": 0.90659094, + "num_input_tokens_seen": 55805235, + "step": 2596, + "time_per_iteration": 2.7222213745117188 + }, + { + "auxiliary_loss_clip": 0.01132948, + "auxiliary_loss_mlp": 0.01088323, + "balance_loss_clip": 1.03419578, + "balance_loss_mlp": 1.00687909, + "epoch": 0.3122707869897192, + "flos": 21506326427520.0, + "grad_norm": 1.521752781524805, + "language_loss": 0.69250906, + "learning_rate": 3.2209704751098236e-06, + "loss": 0.71472174, + "num_input_tokens_seen": 55824265, + "step": 2597, + "time_per_iteration": 2.7322983741760254 + }, + { + "auxiliary_loss_clip": 0.0112994, + "auxiliary_loss_mlp": 0.01087292, + "balance_loss_clip": 1.03276944, + "balance_loss_mlp": 1.00584793, + "epoch": 0.31239102988035833, + "flos": 15187017674880.0, + "grad_norm": 1.9851630168613128, + "language_loss": 0.83142412, + "learning_rate": 3.2203534154436875e-06, + "loss": 0.85359645, + "num_input_tokens_seen": 55838620, + "step": 2598, + "time_per_iteration": 2.676957607269287 + }, + { + "auxiliary_loss_clip": 0.01095082, + "auxiliary_loss_mlp": 0.01088176, + "balance_loss_clip": 1.02796674, + "balance_loss_mlp": 1.00677955, + "epoch": 0.31251127277099744, + "flos": 22053712763520.0, + "grad_norm": 2.043175105631969, + "language_loss": 0.75392693, + "learning_rate": 3.2197361706509084e-06, + "loss": 0.77575946, + "num_input_tokens_seen": 55859375, + "step": 2599, + "time_per_iteration": 2.873516321182251 + }, + { + "auxiliary_loss_clip": 0.01149115, + "auxiliary_loss_mlp": 0.0108806, + "balance_loss_clip": 1.03386784, + "balance_loss_mlp": 1.00647318, + "epoch": 0.3126315156616365, + "flos": 15193984913280.0, + "grad_norm": 2.567195758449358, + "language_loss": 0.83790952, + "learning_rate": 3.2191187408251228e-06, + "loss": 0.86028129, + "num_input_tokens_seen": 55876535, + "step": 2600, + "time_per_iteration": 2.6173059940338135 + }, + { + "auxiliary_loss_clip": 0.01127119, + "auxiliary_loss_mlp": 0.01087, + "balance_loss_clip": 1.03314662, + "balance_loss_mlp": 1.0054605, + "epoch": 0.3127517585522756, + "flos": 18145338831360.0, + "grad_norm": 2.8279399709622743, + "language_loss": 0.78774351, + "learning_rate": 3.218501126059993e-06, + "loss": 0.80988467, + "num_input_tokens_seen": 55891930, + "step": 2601, + "time_per_iteration": 2.6744589805603027 + }, + { + "auxiliary_loss_clip": 0.01139807, + "auxiliary_loss_mlp": 0.01087113, + "balance_loss_clip": 1.03257799, + "balance_loss_mlp": 1.00571704, + "epoch": 0.31287200144291466, + "flos": 21908633731200.0, + "grad_norm": 1.6808873941140565, + "language_loss": 0.81075072, + "learning_rate": 3.2178833264492116e-06, + "loss": 0.83301985, + "num_input_tokens_seen": 55910635, + "step": 2602, + "time_per_iteration": 2.774832010269165 + }, + { + "auxiliary_loss_clip": 0.01140899, + "auxiliary_loss_mlp": 0.01087961, + "balance_loss_clip": 1.03350818, + "balance_loss_mlp": 1.00646973, + "epoch": 0.31299224433355377, + "flos": 29896997800320.0, + "grad_norm": 1.8475880385507546, + "language_loss": 0.76009107, + "learning_rate": 3.217265342086498e-06, + "loss": 0.78237969, + "num_input_tokens_seen": 55931125, + "step": 2603, + "time_per_iteration": 2.7333312034606934 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.008739, + "balance_loss_clip": 1.02844667, + "balance_loss_mlp": 1.00033164, + "epoch": 0.3131124872241929, + "flos": 11655886331520.0, + "grad_norm": 2.7802987456324355, + "language_loss": 0.72460514, + "learning_rate": 3.216647173065599e-06, + "loss": 0.74450207, + "num_input_tokens_seen": 55946590, + "step": 2604, + "time_per_iteration": 2.8300161361694336 + }, + { + "auxiliary_loss_clip": 0.01127472, + "auxiliary_loss_mlp": 0.0108622, + "balance_loss_clip": 1.03221667, + "balance_loss_mlp": 1.00482368, + "epoch": 0.31323273011483194, + "flos": 49848785470080.0, + "grad_norm": 1.7127196712461183, + "language_loss": 0.73769474, + "learning_rate": 3.216028819480292e-06, + "loss": 0.75983167, + "num_input_tokens_seen": 55967930, + "step": 2605, + "time_per_iteration": 3.840441942214966 + }, + { + "auxiliary_loss_clip": 0.0112991, + "auxiliary_loss_mlp": 0.01086617, + "balance_loss_clip": 1.03333616, + "balance_loss_mlp": 1.00536358, + "epoch": 0.31335297300547105, + "flos": 22601278667520.0, + "grad_norm": 2.003073529424414, + "language_loss": 0.7554968, + "learning_rate": 3.2154102814243793e-06, + "loss": 0.77766204, + "num_input_tokens_seen": 55987070, + "step": 2606, + "time_per_iteration": 2.8755664825439453 + }, + { + "auxiliary_loss_clip": 0.01114212, + "auxiliary_loss_mlp": 0.01087196, + "balance_loss_clip": 1.03060722, + "balance_loss_mlp": 1.00589538, + "epoch": 0.31347321589611016, + "flos": 34710858708480.0, + "grad_norm": 2.1132779765431895, + "language_loss": 0.67130286, + "learning_rate": 3.2147915589916937e-06, + "loss": 0.69331694, + "num_input_tokens_seen": 56008630, + "step": 2607, + "time_per_iteration": 3.0126075744628906 + }, + { + "auxiliary_loss_clip": 0.01133333, + "auxiliary_loss_mlp": 0.01087547, + "balance_loss_clip": 1.03407419, + "balance_loss_mlp": 1.00619864, + "epoch": 0.3135934587867492, + "flos": 19755789108480.0, + "grad_norm": 2.103501193909835, + "language_loss": 0.82849348, + "learning_rate": 3.2141726522760938e-06, + "loss": 0.85070223, + "num_input_tokens_seen": 56026690, + "step": 2608, + "time_per_iteration": 2.7621586322784424 + }, + { + "auxiliary_loss_clip": 0.01132085, + "auxiliary_loss_mlp": 0.01079824, + "balance_loss_clip": 1.04058492, + "balance_loss_mlp": 1.00009727, + "epoch": 0.3137137016773883, + "flos": 65815535583360.0, + "grad_norm": 0.7042872077152019, + "language_loss": 0.52665639, + "learning_rate": 3.213553561371469e-06, + "loss": 0.54877555, + "num_input_tokens_seen": 56090425, + "step": 2609, + "time_per_iteration": 3.4351136684417725 + }, + { + "auxiliary_loss_clip": 0.01111653, + "auxiliary_loss_mlp": 0.01087401, + "balance_loss_clip": 1.03235877, + "balance_loss_mlp": 1.00619578, + "epoch": 0.31383394456802743, + "flos": 16252739222400.0, + "grad_norm": 2.1538388192990854, + "language_loss": 0.95748663, + "learning_rate": 3.212934286371733e-06, + "loss": 0.97947711, + "num_input_tokens_seen": 56107135, + "step": 2610, + "time_per_iteration": 3.8682994842529297 + }, + { + "auxiliary_loss_clip": 0.0113543, + "auxiliary_loss_mlp": 0.01086771, + "balance_loss_clip": 1.03140545, + "balance_loss_mlp": 1.0051837, + "epoch": 0.3139541874586665, + "flos": 38795517613440.0, + "grad_norm": 2.2158691617109927, + "language_loss": 0.83205986, + "learning_rate": 3.2123148273708304e-06, + "loss": 0.8542819, + "num_input_tokens_seen": 56127325, + "step": 2611, + "time_per_iteration": 3.928952217102051 + }, + { + "auxiliary_loss_clip": 0.01150003, + "auxiliary_loss_mlp": 0.01087715, + "balance_loss_clip": 1.03501272, + "balance_loss_mlp": 1.00631893, + "epoch": 0.3140744303493056, + "flos": 25046328430080.0, + "grad_norm": 1.7695409262404351, + "language_loss": 0.76241183, + "learning_rate": 3.211695184462733e-06, + "loss": 0.78478897, + "num_input_tokens_seen": 56148500, + "step": 2612, + "time_per_iteration": 2.6841373443603516 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_clip": 1.03862453, + "balance_loss_mlp": 1.00004411, + "epoch": 0.3141946732399447, + "flos": 72504254782080.0, + "grad_norm": 0.892371920996409, + "language_loss": 0.60444593, + "learning_rate": 3.2110753577414383e-06, + "loss": 0.62623203, + "num_input_tokens_seen": 56210080, + "step": 2613, + "time_per_iteration": 4.2522430419921875 + }, + { + "auxiliary_loss_clip": 0.01130884, + "auxiliary_loss_mlp": 0.01087453, + "balance_loss_clip": 1.03308976, + "balance_loss_mlp": 1.00600946, + "epoch": 0.31431491613058377, + "flos": 19239788280960.0, + "grad_norm": 1.6983436112694459, + "language_loss": 0.79008919, + "learning_rate": 3.2104553473009757e-06, + "loss": 0.81227255, + "num_input_tokens_seen": 56228200, + "step": 2614, + "time_per_iteration": 2.702861785888672 + }, + { + "auxiliary_loss_clip": 0.0110732, + "auxiliary_loss_mlp": 0.01087338, + "balance_loss_clip": 1.02849698, + "balance_loss_mlp": 1.00598931, + "epoch": 0.3144351590212229, + "flos": 36210596290560.0, + "grad_norm": 2.0353895868992615, + "language_loss": 0.68076766, + "learning_rate": 3.209835153235399e-06, + "loss": 0.70271432, + "num_input_tokens_seen": 56249755, + "step": 2615, + "time_per_iteration": 2.949172019958496 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.01088086, + "balance_loss_clip": 1.03685045, + "balance_loss_mlp": 1.00688028, + "epoch": 0.314555401911862, + "flos": 18551740285440.0, + "grad_norm": 1.7210331550342097, + "language_loss": 0.67879975, + "learning_rate": 3.2092147756387916e-06, + "loss": 0.70082277, + "num_input_tokens_seen": 56270080, + "step": 2616, + "time_per_iteration": 2.8146233558654785 + }, + { + "auxiliary_loss_clip": 0.01133637, + "auxiliary_loss_mlp": 0.0108787, + "balance_loss_clip": 1.03441882, + "balance_loss_mlp": 1.0064261, + "epoch": 0.31467564480250104, + "flos": 16362877299840.0, + "grad_norm": 2.0519961229708907, + "language_loss": 0.83481848, + "learning_rate": 3.208594214605264e-06, + "loss": 0.85703349, + "num_input_tokens_seen": 56288625, + "step": 2617, + "time_per_iteration": 2.7729344367980957 + }, + { + "auxiliary_loss_clip": 0.01129742, + "auxiliary_loss_mlp": 0.01087738, + "balance_loss_clip": 1.03280509, + "balance_loss_mlp": 1.00662804, + "epoch": 0.31479588769314015, + "flos": 21652375127040.0, + "grad_norm": 2.0649754383095336, + "language_loss": 0.7740984, + "learning_rate": 3.2079734702289553e-06, + "loss": 0.79627311, + "num_input_tokens_seen": 56307520, + "step": 2618, + "time_per_iteration": 2.746514081954956 + }, + { + "auxiliary_loss_clip": 0.01129628, + "auxiliary_loss_mlp": 0.00873216, + "balance_loss_clip": 1.03854382, + "balance_loss_mlp": 1.00070155, + "epoch": 0.3149161305837792, + "flos": 66051072040320.0, + "grad_norm": 0.818112055627154, + "language_loss": 0.60442358, + "learning_rate": 3.207352542604031e-06, + "loss": 0.62445199, + "num_input_tokens_seen": 56369855, + "step": 2619, + "time_per_iteration": 3.3306384086608887 + }, + { + "auxiliary_loss_clip": 0.01112647, + "auxiliary_loss_mlp": 0.01088415, + "balance_loss_clip": 1.03007746, + "balance_loss_mlp": 1.00716174, + "epoch": 0.3150363734744183, + "flos": 28987201192320.0, + "grad_norm": 1.5256467205971473, + "language_loss": 0.78575325, + "learning_rate": 3.2067314318246864e-06, + "loss": 0.80776381, + "num_input_tokens_seen": 56390570, + "step": 2620, + "time_per_iteration": 2.878098726272583 + }, + { + "auxiliary_loss_clip": 0.01118985, + "auxiliary_loss_mlp": 0.01089836, + "balance_loss_clip": 1.03092885, + "balance_loss_mlp": 1.00844026, + "epoch": 0.31515661636505743, + "flos": 27636600879360.0, + "grad_norm": 1.7218128775408568, + "language_loss": 0.77638507, + "learning_rate": 3.206110137985143e-06, + "loss": 0.7984733, + "num_input_tokens_seen": 56410775, + "step": 2621, + "time_per_iteration": 2.84682297706604 + }, + { + "auxiliary_loss_clip": 0.01111663, + "auxiliary_loss_mlp": 0.01087694, + "balance_loss_clip": 1.02875924, + "balance_loss_mlp": 1.00620222, + "epoch": 0.3152768592556965, + "flos": 24605632465920.0, + "grad_norm": 1.8374535108197092, + "language_loss": 0.92551941, + "learning_rate": 3.2054886611796505e-06, + "loss": 0.94751292, + "num_input_tokens_seen": 56429770, + "step": 2622, + "time_per_iteration": 2.8440239429473877 + }, + { + "auxiliary_loss_clip": 0.01139768, + "auxiliary_loss_mlp": 0.01079782, + "balance_loss_clip": 1.03997171, + "balance_loss_mlp": 1.00005519, + "epoch": 0.3153971021463356, + "flos": 68476908026880.0, + "grad_norm": 0.8844796888020353, + "language_loss": 0.63611269, + "learning_rate": 3.204867001502487e-06, + "loss": 0.65830815, + "num_input_tokens_seen": 56488425, + "step": 2623, + "time_per_iteration": 3.191598415374756 + }, + { + "auxiliary_loss_clip": 0.01152461, + "auxiliary_loss_mlp": 0.01088825, + "balance_loss_clip": 1.03705108, + "balance_loss_mlp": 1.00742936, + "epoch": 0.3155173450369747, + "flos": 25593714766080.0, + "grad_norm": 1.6867153254992462, + "language_loss": 0.808568, + "learning_rate": 3.2042451590479567e-06, + "loss": 0.83098078, + "num_input_tokens_seen": 56508940, + "step": 2624, + "time_per_iteration": 2.6901516914367676 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01087242, + "balance_loss_clip": 1.03516209, + "balance_loss_mlp": 1.00608444, + "epoch": 0.31563758792761376, + "flos": 24309333175680.0, + "grad_norm": 1.8628444966937936, + "language_loss": 0.87007493, + "learning_rate": 3.203623133910394e-06, + "loss": 0.89244461, + "num_input_tokens_seen": 56527245, + "step": 2625, + "time_per_iteration": 2.715071439743042 + }, + { + "auxiliary_loss_clip": 0.01087076, + "auxiliary_loss_mlp": 0.01086792, + "balance_loss_clip": 1.02969444, + "balance_loss_mlp": 1.00544357, + "epoch": 0.31575783081825287, + "flos": 31903865550720.0, + "grad_norm": 2.195575811963249, + "language_loss": 0.76884985, + "learning_rate": 3.203000926184158e-06, + "loss": 0.79058862, + "num_input_tokens_seen": 56546170, + "step": 2626, + "time_per_iteration": 2.865720748901367 + }, + { + "auxiliary_loss_clip": 0.01150435, + "auxiliary_loss_mlp": 0.01087554, + "balance_loss_clip": 1.03518331, + "balance_loss_mlp": 1.00630093, + "epoch": 0.315878073708892, + "flos": 30810960385920.0, + "grad_norm": 1.7351306247643117, + "language_loss": 0.77678275, + "learning_rate": 3.202378535963639e-06, + "loss": 0.79916263, + "num_input_tokens_seen": 56567085, + "step": 2627, + "time_per_iteration": 2.709106206893921 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.00873922, + "balance_loss_clip": 1.03329408, + "balance_loss_mlp": 1.0003252, + "epoch": 0.31599831659953104, + "flos": 22200264253440.0, + "grad_norm": 1.5800595594537779, + "language_loss": 0.83830786, + "learning_rate": 3.2017559633432516e-06, + "loss": 0.85835898, + "num_input_tokens_seen": 56586715, + "step": 2628, + "time_per_iteration": 2.7185308933258057 + }, + { + "auxiliary_loss_clip": 0.01120253, + "auxiliary_loss_mlp": 0.01089114, + "balance_loss_clip": 1.03446269, + "balance_loss_mlp": 1.0076704, + "epoch": 0.31611855949017015, + "flos": 25593463370880.0, + "grad_norm": 1.753458497949332, + "language_loss": 0.66447526, + "learning_rate": 3.2011332084174398e-06, + "loss": 0.68656898, + "num_input_tokens_seen": 56607585, + "step": 2629, + "time_per_iteration": 2.7594218254089355 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01088123, + "balance_loss_clip": 1.03171206, + "balance_loss_mlp": 1.00663185, + "epoch": 0.31623880238080926, + "flos": 20594087694720.0, + "grad_norm": 1.525706038362431, + "language_loss": 0.89266056, + "learning_rate": 3.2005102712806756e-06, + "loss": 0.91490245, + "num_input_tokens_seen": 56626415, + "step": 2630, + "time_per_iteration": 2.668415069580078 + }, + { + "auxiliary_loss_clip": 0.01140989, + "auxiliary_loss_mlp": 0.01090002, + "balance_loss_clip": 1.03380084, + "balance_loss_mlp": 1.00850999, + "epoch": 0.3163590452714483, + "flos": 12784917600000.0, + "grad_norm": 1.8968203998591717, + "language_loss": 0.72645175, + "learning_rate": 3.1998871520274575e-06, + "loss": 0.74876165, + "num_input_tokens_seen": 56641750, + "step": 2631, + "time_per_iteration": 3.5518229007720947 + }, + { + "auxiliary_loss_clip": 0.01132542, + "auxiliary_loss_mlp": 0.01086502, + "balance_loss_clip": 1.0330708, + "balance_loss_mlp": 1.00510585, + "epoch": 0.3164792881620874, + "flos": 23041292273280.0, + "grad_norm": 1.746322914728066, + "language_loss": 0.84914023, + "learning_rate": 3.199263850752312e-06, + "loss": 0.87133068, + "num_input_tokens_seen": 56662585, + "step": 2632, + "time_per_iteration": 2.8925912380218506 + }, + { + "auxiliary_loss_clip": 0.01140747, + "auxiliary_loss_mlp": 0.01089864, + "balance_loss_clip": 1.03419852, + "balance_loss_mlp": 1.00842047, + "epoch": 0.31659953105272653, + "flos": 18296271780480.0, + "grad_norm": 1.9524093254991293, + "language_loss": 0.85634124, + "learning_rate": 3.198640367549795e-06, + "loss": 0.87864739, + "num_input_tokens_seen": 56681480, + "step": 2633, + "time_per_iteration": 2.6316771507263184 + }, + { + "auxiliary_loss_clip": 0.01139284, + "auxiliary_loss_mlp": 0.00873766, + "balance_loss_clip": 1.0325489, + "balance_loss_mlp": 1.00031269, + "epoch": 0.3167197739433656, + "flos": 25703421880320.0, + "grad_norm": 1.7548635947333489, + "language_loss": 0.85604334, + "learning_rate": 3.198016702514487e-06, + "loss": 0.87617385, + "num_input_tokens_seen": 56701760, + "step": 2634, + "time_per_iteration": 2.8124468326568604 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01085494, + "balance_loss_clip": 1.03361166, + "balance_loss_mlp": 1.00438356, + "epoch": 0.3168400168340047, + "flos": 23546016230400.0, + "grad_norm": 3.691704085364603, + "language_loss": 0.84225172, + "learning_rate": 3.1973928557409972e-06, + "loss": 0.86459076, + "num_input_tokens_seen": 56719800, + "step": 2635, + "time_per_iteration": 2.7051455974578857 + }, + { + "auxiliary_loss_clip": 0.01147564, + "auxiliary_loss_mlp": 0.01088113, + "balance_loss_clip": 1.03296232, + "balance_loss_mlp": 1.00676501, + "epoch": 0.31696025972464376, + "flos": 28366449327360.0, + "grad_norm": 1.9370246964746203, + "language_loss": 0.71335018, + "learning_rate": 3.1967688273239636e-06, + "loss": 0.73570693, + "num_input_tokens_seen": 56739605, + "step": 2636, + "time_per_iteration": 3.6295971870422363 + }, + { + "auxiliary_loss_clip": 0.0110578, + "auxiliary_loss_mlp": 0.01087677, + "balance_loss_clip": 1.03123093, + "balance_loss_mlp": 1.00637662, + "epoch": 0.31708050261528287, + "flos": 16399111144320.0, + "grad_norm": 1.73224430823807, + "language_loss": 0.82144439, + "learning_rate": 3.1961446173580503e-06, + "loss": 0.8433789, + "num_input_tokens_seen": 56756545, + "step": 2637, + "time_per_iteration": 2.7941224575042725 + }, + { + "auxiliary_loss_clip": 0.01129372, + "auxiliary_loss_mlp": 0.01088549, + "balance_loss_clip": 1.03286719, + "balance_loss_mlp": 1.00729609, + "epoch": 0.317200745505922, + "flos": 26212347728640.0, + "grad_norm": 1.6434154262976013, + "language_loss": 0.7739172, + "learning_rate": 3.1955202259379502e-06, + "loss": 0.79609644, + "num_input_tokens_seen": 56778275, + "step": 2638, + "time_per_iteration": 3.720491647720337 + }, + { + "auxiliary_loss_clip": 0.01139713, + "auxiliary_loss_mlp": 0.01087833, + "balance_loss_clip": 1.0328306, + "balance_loss_mlp": 1.00658, + "epoch": 0.31732098839656103, + "flos": 31350876693120.0, + "grad_norm": 1.7317109394495365, + "language_loss": 0.82898742, + "learning_rate": 3.194895653158381e-06, + "loss": 0.85126287, + "num_input_tokens_seen": 56797215, + "step": 2639, + "time_per_iteration": 2.7922332286834717 + }, + { + "auxiliary_loss_clip": 0.01137878, + "auxiliary_loss_mlp": 0.01079794, + "balance_loss_clip": 1.03808117, + "balance_loss_mlp": 1.00044787, + "epoch": 0.31744123128720014, + "flos": 58989024835200.0, + "grad_norm": 0.7672062117367588, + "language_loss": 0.5554111, + "learning_rate": 3.194270899114093e-06, + "loss": 0.57758784, + "num_input_tokens_seen": 56863010, + "step": 2640, + "time_per_iteration": 3.368121862411499 + }, + { + "auxiliary_loss_clip": 0.01141714, + "auxiliary_loss_mlp": 0.01089023, + "balance_loss_clip": 1.03420413, + "balance_loss_mlp": 1.00753164, + "epoch": 0.31756147417783925, + "flos": 17417573372160.0, + "grad_norm": 1.768960801791442, + "language_loss": 0.82021922, + "learning_rate": 3.193645963899858e-06, + "loss": 0.84252656, + "num_input_tokens_seen": 56880625, + "step": 2641, + "time_per_iteration": 2.7262015342712402 + }, + { + "auxiliary_loss_clip": 0.01122733, + "auxiliary_loss_mlp": 0.01088246, + "balance_loss_clip": 1.03069377, + "balance_loss_mlp": 1.00684941, + "epoch": 0.3176817170684783, + "flos": 25481673267840.0, + "grad_norm": 4.1514867797808215, + "language_loss": 0.83875138, + "learning_rate": 3.193020847610479e-06, + "loss": 0.86086118, + "num_input_tokens_seen": 56900945, + "step": 2642, + "time_per_iteration": 2.7757489681243896 + }, + { + "auxiliary_loss_clip": 0.01132534, + "auxiliary_loss_mlp": 0.01087306, + "balance_loss_clip": 1.03452277, + "balance_loss_mlp": 1.00595808, + "epoch": 0.3178019599591174, + "flos": 24972603765120.0, + "grad_norm": 2.199732573985027, + "language_loss": 0.70845354, + "learning_rate": 3.192395550340787e-06, + "loss": 0.73065197, + "num_input_tokens_seen": 56918895, + "step": 2643, + "time_per_iteration": 2.7238237857818604 + }, + { + "auxiliary_loss_clip": 0.0113711, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_clip": 1.03212786, + "balance_loss_mlp": 1.00702775, + "epoch": 0.31792220284975653, + "flos": 12422220019200.0, + "grad_norm": 1.8396703227194497, + "language_loss": 0.76451492, + "learning_rate": 3.191770072185638e-06, + "loss": 0.78676879, + "num_input_tokens_seen": 56935890, + "step": 2644, + "time_per_iteration": 2.709601640701294 + }, + { + "auxiliary_loss_clip": 0.01130964, + "auxiliary_loss_mlp": 0.01087153, + "balance_loss_clip": 1.03075969, + "balance_loss_mlp": 1.00589967, + "epoch": 0.3180424457403956, + "flos": 15485759089920.0, + "grad_norm": 2.267283886930511, + "language_loss": 0.72703046, + "learning_rate": 3.191144413239916e-06, + "loss": 0.74921161, + "num_input_tokens_seen": 56952460, + "step": 2645, + "time_per_iteration": 2.6868033409118652 + }, + { + "auxiliary_loss_clip": 0.01131098, + "auxiliary_loss_mlp": 0.0108977, + "balance_loss_clip": 1.03345418, + "balance_loss_mlp": 1.00823057, + "epoch": 0.3181626886310347, + "flos": 26174964648960.0, + "grad_norm": 1.9865260396262003, + "language_loss": 0.88302803, + "learning_rate": 3.190518573598534e-06, + "loss": 0.90523672, + "num_input_tokens_seen": 56969065, + "step": 2646, + "time_per_iteration": 2.8292977809906006 + }, + { + "auxiliary_loss_clip": 0.0111019, + "auxiliary_loss_mlp": 0.01087613, + "balance_loss_clip": 1.033167, + "balance_loss_mlp": 1.00621653, + "epoch": 0.3182829315216738, + "flos": 25483109811840.0, + "grad_norm": 1.6963146278204462, + "language_loss": 0.77496481, + "learning_rate": 3.1898925533564308e-06, + "loss": 0.79694283, + "num_input_tokens_seen": 56990535, + "step": 2647, + "time_per_iteration": 2.9836363792419434 + }, + { + "auxiliary_loss_clip": 0.01116055, + "auxiliary_loss_mlp": 0.01087383, + "balance_loss_clip": 1.02843368, + "balance_loss_mlp": 1.00608242, + "epoch": 0.31840317441231286, + "flos": 18113701927680.0, + "grad_norm": 2.0325548994918403, + "language_loss": 0.64017034, + "learning_rate": 3.1892663526085733e-06, + "loss": 0.66220474, + "num_input_tokens_seen": 57008910, + "step": 2648, + "time_per_iteration": 2.6822774410247803 + }, + { + "auxiliary_loss_clip": 0.01137388, + "auxiliary_loss_mlp": 0.01079808, + "balance_loss_clip": 1.03759849, + "balance_loss_mlp": 1.0000813, + "epoch": 0.31852341730295197, + "flos": 64741948957440.0, + "grad_norm": 0.7427217879657525, + "language_loss": 0.56962991, + "learning_rate": 3.188639971449956e-06, + "loss": 0.59180188, + "num_input_tokens_seen": 57074960, + "step": 2649, + "time_per_iteration": 3.3571736812591553 + }, + { + "auxiliary_loss_clip": 0.01149995, + "auxiliary_loss_mlp": 0.0108816, + "balance_loss_clip": 1.03494453, + "balance_loss_mlp": 1.00676394, + "epoch": 0.318643660193591, + "flos": 20668135582080.0, + "grad_norm": 3.620681135344392, + "language_loss": 0.72818398, + "learning_rate": 3.1880134099756e-06, + "loss": 0.75056547, + "num_input_tokens_seen": 57094595, + "step": 2650, + "time_per_iteration": 2.622864007949829 + }, + { + "auxiliary_loss_clip": 0.0113945, + "auxiliary_loss_mlp": 0.01087213, + "balance_loss_clip": 1.03260314, + "balance_loss_mlp": 1.00596046, + "epoch": 0.31876390308423014, + "flos": 26943345411840.0, + "grad_norm": 1.8430407662928654, + "language_loss": 0.69283211, + "learning_rate": 3.1873866682805535e-06, + "loss": 0.71509874, + "num_input_tokens_seen": 57115290, + "step": 2651, + "time_per_iteration": 2.741116762161255 + }, + { + "auxiliary_loss_clip": 0.01116456, + "auxiliary_loss_mlp": 0.01087373, + "balance_loss_clip": 1.03276634, + "balance_loss_mlp": 1.00592971, + "epoch": 0.31888414597486925, + "flos": 18041916597120.0, + "grad_norm": 1.8556951345638426, + "language_loss": 0.88421273, + "learning_rate": 3.186759746459894e-06, + "loss": 0.90625101, + "num_input_tokens_seen": 57134400, + "step": 2652, + "time_per_iteration": 2.7226507663726807 + }, + { + "auxiliary_loss_clip": 0.0112545, + "auxiliary_loss_mlp": 0.01086953, + "balance_loss_clip": 1.02985024, + "balance_loss_mlp": 1.00560427, + "epoch": 0.3190043888655083, + "flos": 25149319701120.0, + "grad_norm": 1.7151225543972228, + "language_loss": 0.79609144, + "learning_rate": 3.1861326446087246e-06, + "loss": 0.81821543, + "num_input_tokens_seen": 57153140, + "step": 2653, + "time_per_iteration": 2.7441301345825195 + }, + { + "auxiliary_loss_clip": 0.01139553, + "auxiliary_loss_mlp": 0.01087476, + "balance_loss_clip": 1.03281343, + "balance_loss_mlp": 1.00617588, + "epoch": 0.3191246317561474, + "flos": 22053892331520.0, + "grad_norm": 1.9545688543360173, + "language_loss": 0.71742392, + "learning_rate": 3.1855053628221763e-06, + "loss": 0.73969424, + "num_input_tokens_seen": 57172395, + "step": 2654, + "time_per_iteration": 2.720852851867676 + }, + { + "auxiliary_loss_clip": 0.01122015, + "auxiliary_loss_mlp": 0.01087882, + "balance_loss_clip": 1.03266549, + "balance_loss_mlp": 1.00643814, + "epoch": 0.3192448746467865, + "flos": 14901815687040.0, + "grad_norm": 2.474076229069164, + "language_loss": 0.89530575, + "learning_rate": 3.184877901195407e-06, + "loss": 0.91740477, + "num_input_tokens_seen": 57189090, + "step": 2655, + "time_per_iteration": 2.7783830165863037 + }, + { + "auxiliary_loss_clip": 0.01108549, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_clip": 1.02611935, + "balance_loss_mlp": 1.00455248, + "epoch": 0.3193651175374256, + "flos": 67234832657280.0, + "grad_norm": 0.7916971542051614, + "language_loss": 0.62863553, + "learning_rate": 3.184250259823602e-06, + "loss": 0.65056384, + "num_input_tokens_seen": 57251620, + "step": 2656, + "time_per_iteration": 4.253613233566284 + }, + { + "auxiliary_loss_clip": 0.01116493, + "auxiliary_loss_mlp": 0.01088762, + "balance_loss_clip": 1.02948475, + "balance_loss_mlp": 1.00741315, + "epoch": 0.3194853604280647, + "flos": 12233077977600.0, + "grad_norm": 9.948129188774137, + "language_loss": 0.81159765, + "learning_rate": 3.183622438801974e-06, + "loss": 0.83365017, + "num_input_tokens_seen": 57266910, + "step": 2657, + "time_per_iteration": 2.7267351150512695 + }, + { + "auxiliary_loss_clip": 0.01151255, + "auxiliary_loss_mlp": 0.01088052, + "balance_loss_clip": 1.03629756, + "balance_loss_mlp": 1.00675154, + "epoch": 0.3196056033187038, + "flos": 14939917038720.0, + "grad_norm": 1.8318468392018254, + "language_loss": 0.75129604, + "learning_rate": 3.1829944382257637e-06, + "loss": 0.77368909, + "num_input_tokens_seen": 57285040, + "step": 2658, + "time_per_iteration": 2.6289100646972656 + }, + { + "auxiliary_loss_clip": 0.01136526, + "auxiliary_loss_mlp": 0.010873, + "balance_loss_clip": 1.032233, + "balance_loss_mlp": 1.00609446, + "epoch": 0.31972584620934286, + "flos": 23768878164480.0, + "grad_norm": 2.377945223947707, + "language_loss": 0.81458533, + "learning_rate": 3.1823662581902373e-06, + "loss": 0.83682358, + "num_input_tokens_seen": 57302725, + "step": 2659, + "time_per_iteration": 2.71826434135437 + }, + { + "auxiliary_loss_clip": 0.01122197, + "auxiliary_loss_mlp": 0.01089035, + "balance_loss_clip": 1.03086364, + "balance_loss_mlp": 1.00759149, + "epoch": 0.31984608909998197, + "flos": 21251540280960.0, + "grad_norm": 2.2031838034468203, + "language_loss": 0.74748778, + "learning_rate": 3.1817378987906896e-06, + "loss": 0.76960015, + "num_input_tokens_seen": 57322230, + "step": 2660, + "time_per_iteration": 2.8206441402435303 + }, + { + "auxiliary_loss_clip": 0.0109723, + "auxiliary_loss_mlp": 0.01088835, + "balance_loss_clip": 1.02614176, + "balance_loss_mlp": 1.00767708, + "epoch": 0.3199663319906211, + "flos": 18296235866880.0, + "grad_norm": 2.1450374952170277, + "language_loss": 0.79677302, + "learning_rate": 3.181109360122442e-06, + "loss": 0.81863368, + "num_input_tokens_seen": 57339820, + "step": 2661, + "time_per_iteration": 3.668757438659668 + }, + { + "auxiliary_loss_clip": 0.01114281, + "auxiliary_loss_mlp": 0.01086136, + "balance_loss_clip": 1.02746212, + "balance_loss_mlp": 1.00488269, + "epoch": 0.32008657488126013, + "flos": 18733627779840.0, + "grad_norm": 1.9936573097240984, + "language_loss": 0.7788465, + "learning_rate": 3.1804806422808445e-06, + "loss": 0.80085075, + "num_input_tokens_seen": 57356955, + "step": 2662, + "time_per_iteration": 3.7988033294677734 + }, + { + "auxiliary_loss_clip": 0.01123947, + "auxiliary_loss_mlp": 0.01088223, + "balance_loss_clip": 1.03182054, + "balance_loss_mlp": 1.00682688, + "epoch": 0.32020681777189924, + "flos": 20595344670720.0, + "grad_norm": 1.6345873795975665, + "language_loss": 0.72956157, + "learning_rate": 3.1798517453612714e-06, + "loss": 0.75168329, + "num_input_tokens_seen": 57376760, + "step": 2663, + "time_per_iteration": 2.7914319038391113 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_clip": 1.03006423, + "balance_loss_mlp": 1.00531054, + "epoch": 0.32032706066253835, + "flos": 35261692750080.0, + "grad_norm": 1.7101885115351618, + "language_loss": 0.75274658, + "learning_rate": 3.1792226694591265e-06, + "loss": 0.77495182, + "num_input_tokens_seen": 57398145, + "step": 2664, + "time_per_iteration": 3.8349411487579346 + }, + { + "auxiliary_loss_clip": 0.01111768, + "auxiliary_loss_mlp": 0.0108726, + "balance_loss_clip": 1.02950478, + "balance_loss_mlp": 1.00619793, + "epoch": 0.3204473035531774, + "flos": 15304230731520.0, + "grad_norm": 3.2844953345138155, + "language_loss": 0.80237699, + "learning_rate": 3.178593414669841e-06, + "loss": 0.82436723, + "num_input_tokens_seen": 57416730, + "step": 2665, + "time_per_iteration": 2.7396960258483887 + }, + { + "auxiliary_loss_clip": 0.01134377, + "auxiliary_loss_mlp": 0.01086824, + "balance_loss_clip": 1.03030133, + "balance_loss_mlp": 1.00538039, + "epoch": 0.3205675464438165, + "flos": 24462564595200.0, + "grad_norm": 2.33540519904257, + "language_loss": 0.70608002, + "learning_rate": 3.1779639810888707e-06, + "loss": 0.72829199, + "num_input_tokens_seen": 57436325, + "step": 2666, + "time_per_iteration": 2.8732683658599854 + }, + { + "auxiliary_loss_clip": 0.01139057, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_clip": 1.03324938, + "balance_loss_mlp": 1.00588059, + "epoch": 0.3206877893344556, + "flos": 22456235548800.0, + "grad_norm": 5.078690054718468, + "language_loss": 0.75926769, + "learning_rate": 3.1773343688117013e-06, + "loss": 0.78153014, + "num_input_tokens_seen": 57457235, + "step": 2667, + "time_per_iteration": 2.7751686573028564 + }, + { + "auxiliary_loss_clip": 0.01132456, + "auxiliary_loss_mlp": 0.00873702, + "balance_loss_clip": 1.03352487, + "balance_loss_mlp": 1.00022745, + "epoch": 0.3208080322250947, + "flos": 20412236113920.0, + "grad_norm": 1.9705309078523083, + "language_loss": 0.84167135, + "learning_rate": 3.1767045779338445e-06, + "loss": 0.8617329, + "num_input_tokens_seen": 57474895, + "step": 2668, + "time_per_iteration": 2.7262022495269775 + }, + { + "auxiliary_loss_clip": 0.01140679, + "auxiliary_loss_mlp": 0.01087302, + "balance_loss_clip": 1.03347886, + "balance_loss_mlp": 1.00590611, + "epoch": 0.3209282751157338, + "flos": 21762118154880.0, + "grad_norm": 2.0161176416205513, + "language_loss": 0.9130851, + "learning_rate": 3.176074608550839e-06, + "loss": 0.93536496, + "num_input_tokens_seen": 57490715, + "step": 2669, + "time_per_iteration": 2.6994457244873047 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01088775, + "balance_loss_clip": 1.02902985, + "balance_loss_mlp": 1.00737882, + "epoch": 0.32104851800637285, + "flos": 22055041566720.0, + "grad_norm": 2.0525861746450107, + "language_loss": 0.82473195, + "learning_rate": 3.17544446075825e-06, + "loss": 0.8466171, + "num_input_tokens_seen": 57509880, + "step": 2670, + "time_per_iteration": 2.817026138305664 + }, + { + "auxiliary_loss_clip": 0.0113101, + "auxiliary_loss_mlp": 0.01087193, + "balance_loss_clip": 1.03239369, + "balance_loss_mlp": 1.00603485, + "epoch": 0.32116876089701196, + "flos": 37012301896320.0, + "grad_norm": 1.645810787671094, + "language_loss": 0.71096539, + "learning_rate": 3.174814134651671e-06, + "loss": 0.73314738, + "num_input_tokens_seen": 57532430, + "step": 2671, + "time_per_iteration": 2.8411099910736084 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01087259, + "balance_loss_clip": 1.03424358, + "balance_loss_mlp": 1.00614953, + "epoch": 0.3212890037876511, + "flos": 21979233912960.0, + "grad_norm": 1.7381339764874768, + "language_loss": 0.80713451, + "learning_rate": 3.1741836303267215e-06, + "loss": 0.82949221, + "num_input_tokens_seen": 57551965, + "step": 2672, + "time_per_iteration": 2.6636431217193604 + }, + { + "auxiliary_loss_clip": 0.01148957, + "auxiliary_loss_mlp": 0.0108585, + "balance_loss_clip": 1.03463769, + "balance_loss_mlp": 1.00474024, + "epoch": 0.32140924667829013, + "flos": 10342345875840.0, + "grad_norm": 1.8469552164115755, + "language_loss": 0.75038457, + "learning_rate": 3.1735529478790496e-06, + "loss": 0.77273262, + "num_input_tokens_seen": 57569955, + "step": 2673, + "time_per_iteration": 2.728708028793335 + }, + { + "auxiliary_loss_clip": 0.01137358, + "auxiliary_loss_mlp": 0.01088263, + "balance_loss_clip": 1.03164411, + "balance_loss_mlp": 1.00681913, + "epoch": 0.32152948956892924, + "flos": 50798910072960.0, + "grad_norm": 1.788529275453582, + "language_loss": 0.79604125, + "learning_rate": 3.172922087404328e-06, + "loss": 0.81829739, + "num_input_tokens_seen": 57592215, + "step": 2674, + "time_per_iteration": 3.0103371143341064 + }, + { + "auxiliary_loss_clip": 0.01136333, + "auxiliary_loss_mlp": 0.01080509, + "balance_loss_clip": 1.03661358, + "balance_loss_mlp": 1.00078154, + "epoch": 0.32164973245956835, + "flos": 63863250549120.0, + "grad_norm": 0.8097101102080579, + "language_loss": 0.55259365, + "learning_rate": 3.1722910489982586e-06, + "loss": 0.57476211, + "num_input_tokens_seen": 57652575, + "step": 2675, + "time_per_iteration": 3.3213210105895996 + }, + { + "auxiliary_loss_clip": 0.01132809, + "auxiliary_loss_mlp": 0.01086681, + "balance_loss_clip": 1.03392744, + "balance_loss_mlp": 1.00537992, + "epoch": 0.3217699753502074, + "flos": 23513948363520.0, + "grad_norm": 1.5607970776455966, + "language_loss": 0.8023296, + "learning_rate": 3.1716598327565694e-06, + "loss": 0.82452452, + "num_input_tokens_seen": 57672215, + "step": 2676, + "time_per_iteration": 2.7670774459838867 + }, + { + "auxiliary_loss_clip": 0.0114809, + "auxiliary_loss_mlp": 0.01087192, + "balance_loss_clip": 1.03332543, + "balance_loss_mlp": 1.00598669, + "epoch": 0.3218902182408465, + "flos": 19062533640960.0, + "grad_norm": 1.4879722148034107, + "language_loss": 0.84269583, + "learning_rate": 3.171028438775015e-06, + "loss": 0.86504865, + "num_input_tokens_seen": 57691410, + "step": 2677, + "time_per_iteration": 2.630315065383911 + }, + { + "auxiliary_loss_clip": 0.01148561, + "auxiliary_loss_mlp": 0.01087926, + "balance_loss_clip": 1.03373456, + "balance_loss_mlp": 1.00662518, + "epoch": 0.3220104611314856, + "flos": 20375571306240.0, + "grad_norm": 2.5535420423754283, + "language_loss": 0.84051728, + "learning_rate": 3.170396867149377e-06, + "loss": 0.8628822, + "num_input_tokens_seen": 57709415, + "step": 2678, + "time_per_iteration": 2.76885986328125 + }, + { + "auxiliary_loss_clip": 0.01105171, + "auxiliary_loss_mlp": 0.01087443, + "balance_loss_clip": 1.02757192, + "balance_loss_mlp": 1.00609422, + "epoch": 0.3221307040221247, + "flos": 20117014231680.0, + "grad_norm": 1.7232731720487615, + "language_loss": 0.86761796, + "learning_rate": 3.1697651179754653e-06, + "loss": 0.88954413, + "num_input_tokens_seen": 57728075, + "step": 2679, + "time_per_iteration": 2.791046380996704 + }, + { + "auxiliary_loss_clip": 0.01116573, + "auxiliary_loss_mlp": 0.01087677, + "balance_loss_clip": 1.03047526, + "balance_loss_mlp": 1.00642419, + "epoch": 0.3222509469127638, + "flos": 23987789602560.0, + "grad_norm": 1.779972291412886, + "language_loss": 0.72747362, + "learning_rate": 3.1691331913491153e-06, + "loss": 0.74951613, + "num_input_tokens_seen": 57750645, + "step": 2680, + "time_per_iteration": 2.8470780849456787 + }, + { + "auxiliary_loss_clip": 0.01149076, + "auxiliary_loss_mlp": 0.01087136, + "balance_loss_clip": 1.03442144, + "balance_loss_mlp": 1.00583577, + "epoch": 0.32237118980340285, + "flos": 17675735397120.0, + "grad_norm": 1.9846083712799434, + "language_loss": 0.84676605, + "learning_rate": 3.1685010873661898e-06, + "loss": 0.86912823, + "num_input_tokens_seen": 57769820, + "step": 2681, + "time_per_iteration": 3.6012558937072754 + }, + { + "auxiliary_loss_clip": 0.01141177, + "auxiliary_loss_mlp": 0.0108823, + "balance_loss_clip": 1.03388739, + "balance_loss_mlp": 1.00683367, + "epoch": 0.32249143269404196, + "flos": 23147982645120.0, + "grad_norm": 1.7931750081793083, + "language_loss": 0.79744297, + "learning_rate": 3.167868806122578e-06, + "loss": 0.81973702, + "num_input_tokens_seen": 57788870, + "step": 2682, + "time_per_iteration": 2.7258353233337402 + }, + { + "auxiliary_loss_clip": 0.01128157, + "auxiliary_loss_mlp": 0.01089143, + "balance_loss_clip": 1.03098941, + "balance_loss_mlp": 1.00769877, + "epoch": 0.32261167558468107, + "flos": 24422308427520.0, + "grad_norm": 1.853875617345374, + "language_loss": 0.66133702, + "learning_rate": 3.1672363477141968e-06, + "loss": 0.68350995, + "num_input_tokens_seen": 57808165, + "step": 2683, + "time_per_iteration": 2.765901803970337 + }, + { + "auxiliary_loss_clip": 0.01116171, + "auxiliary_loss_mlp": 0.01086934, + "balance_loss_clip": 1.03218269, + "balance_loss_mlp": 1.00553787, + "epoch": 0.3227319184753201, + "flos": 30367175852160.0, + "grad_norm": 2.268568501372272, + "language_loss": 0.84875524, + "learning_rate": 3.1666037122369903e-06, + "loss": 0.87078625, + "num_input_tokens_seen": 57828825, + "step": 2684, + "time_per_iteration": 2.8714206218719482 + }, + { + "auxiliary_loss_clip": 0.01139422, + "auxiliary_loss_mlp": 0.01088051, + "balance_loss_clip": 1.03246284, + "balance_loss_mlp": 1.0067023, + "epoch": 0.32285216136595923, + "flos": 16946174257920.0, + "grad_norm": 2.5302139621042223, + "language_loss": 0.8657884, + "learning_rate": 3.165970899786928e-06, + "loss": 0.88806313, + "num_input_tokens_seen": 57846740, + "step": 2685, + "time_per_iteration": 2.7508623600006104 + }, + { + "auxiliary_loss_clip": 0.01115206, + "auxiliary_loss_mlp": 0.01089461, + "balance_loss_clip": 1.02794647, + "balance_loss_mlp": 1.00811267, + "epoch": 0.32297240425659834, + "flos": 21981532383360.0, + "grad_norm": 1.7457619090246699, + "language_loss": 0.75589097, + "learning_rate": 3.1653379104600067e-06, + "loss": 0.77793771, + "num_input_tokens_seen": 57866885, + "step": 2686, + "time_per_iteration": 2.8729164600372314 + }, + { + "auxiliary_loss_clip": 0.01141378, + "auxiliary_loss_mlp": 0.01086495, + "balance_loss_clip": 1.03455853, + "balance_loss_mlp": 1.00528967, + "epoch": 0.3230926471472374, + "flos": 22748045639040.0, + "grad_norm": 1.313152504162452, + "language_loss": 0.69039702, + "learning_rate": 3.164704744352251e-06, + "loss": 0.71267581, + "num_input_tokens_seen": 57887690, + "step": 2687, + "time_per_iteration": 3.7123920917510986 + }, + { + "auxiliary_loss_clip": 0.01136367, + "auxiliary_loss_mlp": 0.01088187, + "balance_loss_clip": 1.03110003, + "balance_loss_mlp": 1.00693429, + "epoch": 0.3232128900378765, + "flos": 16942977947520.0, + "grad_norm": 1.6534085527745452, + "language_loss": 0.80657279, + "learning_rate": 3.164071401559713e-06, + "loss": 0.82881832, + "num_input_tokens_seen": 57905090, + "step": 2688, + "time_per_iteration": 4.13226318359375 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.01087347, + "balance_loss_clip": 1.03274775, + "balance_loss_mlp": 1.0060935, + "epoch": 0.3233331329285156, + "flos": 24023736138240.0, + "grad_norm": 1.6197231536131709, + "language_loss": 0.70957005, + "learning_rate": 3.1634378821784674e-06, + "loss": 0.7317459, + "num_input_tokens_seen": 57925305, + "step": 2689, + "time_per_iteration": 2.8003036975860596 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.01088169, + "balance_loss_clip": 1.02856231, + "balance_loss_mlp": 1.00686824, + "epoch": 0.3234533758191547, + "flos": 18113845582080.0, + "grad_norm": 2.150713180790207, + "language_loss": 0.73749769, + "learning_rate": 3.1628041863046208e-06, + "loss": 0.75938785, + "num_input_tokens_seen": 57942720, + "step": 2690, + "time_per_iteration": 3.7121970653533936 + }, + { + "auxiliary_loss_clip": 0.01148588, + "auxiliary_loss_mlp": 0.01086533, + "balance_loss_clip": 1.03294754, + "balance_loss_mlp": 1.00508881, + "epoch": 0.3235736187097938, + "flos": 16946138344320.0, + "grad_norm": 3.5103812288487704, + "language_loss": 0.91026628, + "learning_rate": 3.162170314034304e-06, + "loss": 0.93261749, + "num_input_tokens_seen": 57960135, + "step": 2691, + "time_per_iteration": 2.6892623901367188 + }, + { + "auxiliary_loss_clip": 0.01149346, + "auxiliary_loss_mlp": 0.01088827, + "balance_loss_clip": 1.03425896, + "balance_loss_mlp": 1.00747859, + "epoch": 0.3236938616004329, + "flos": 22127150119680.0, + "grad_norm": 1.688009498526598, + "language_loss": 0.80732942, + "learning_rate": 3.1615362654636738e-06, + "loss": 0.82971114, + "num_input_tokens_seen": 57980875, + "step": 2692, + "time_per_iteration": 2.752326011657715 + }, + { + "auxiliary_loss_clip": 0.01110857, + "auxiliary_loss_mlp": 0.01088668, + "balance_loss_clip": 1.02936196, + "balance_loss_mlp": 1.00751066, + "epoch": 0.32381410449107195, + "flos": 17164618819200.0, + "grad_norm": 1.755017075273246, + "language_loss": 0.87155461, + "learning_rate": 3.1609020406889163e-06, + "loss": 0.8935498, + "num_input_tokens_seen": 57998310, + "step": 2693, + "time_per_iteration": 2.8826942443847656 + }, + { + "auxiliary_loss_clip": 0.01131583, + "auxiliary_loss_mlp": 0.01088861, + "balance_loss_clip": 1.03254533, + "balance_loss_mlp": 1.00756013, + "epoch": 0.32393434738171106, + "flos": 16578125550720.0, + "grad_norm": 1.6617149764099772, + "language_loss": 0.84851313, + "learning_rate": 3.1602676398062416e-06, + "loss": 0.87071753, + "num_input_tokens_seen": 58017220, + "step": 2694, + "time_per_iteration": 2.7025537490844727 + }, + { + "auxiliary_loss_clip": 0.01137274, + "auxiliary_loss_mlp": 0.0108568, + "balance_loss_clip": 1.03178251, + "balance_loss_mlp": 1.00433195, + "epoch": 0.3240545902723502, + "flos": 25483612602240.0, + "grad_norm": 1.9419399987901187, + "language_loss": 0.61373496, + "learning_rate": 3.1596330629118886e-06, + "loss": 0.63596451, + "num_input_tokens_seen": 58037190, + "step": 2695, + "time_per_iteration": 2.761592149734497 + }, + { + "auxiliary_loss_clip": 0.01108579, + "auxiliary_loss_mlp": 0.01087035, + "balance_loss_clip": 1.02905226, + "balance_loss_mlp": 1.00573468, + "epoch": 0.32417483316298923, + "flos": 35845851634560.0, + "grad_norm": 1.8656320934979804, + "language_loss": 0.73143631, + "learning_rate": 3.1589983101021223e-06, + "loss": 0.7533924, + "num_input_tokens_seen": 58055820, + "step": 2696, + "time_per_iteration": 3.020005941390991 + }, + { + "auxiliary_loss_clip": 0.01114492, + "auxiliary_loss_mlp": 0.0108725, + "balance_loss_clip": 1.03182268, + "balance_loss_mlp": 1.00594902, + "epoch": 0.32429507605362834, + "flos": 30080501406720.0, + "grad_norm": 1.9264830966065325, + "language_loss": 0.84641254, + "learning_rate": 3.1583633814732337e-06, + "loss": 0.8684299, + "num_input_tokens_seen": 58075340, + "step": 2697, + "time_per_iteration": 2.7900805473327637 + }, + { + "auxiliary_loss_clip": 0.01147388, + "auxiliary_loss_mlp": 0.01087379, + "balance_loss_clip": 1.03231013, + "balance_loss_mlp": 1.00603068, + "epoch": 0.3244153189442674, + "flos": 18223265387520.0, + "grad_norm": 2.2205884751182596, + "language_loss": 0.71863353, + "learning_rate": 3.157728277121541e-06, + "loss": 0.74098122, + "num_input_tokens_seen": 58093515, + "step": 2698, + "time_per_iteration": 2.7456657886505127 + }, + { + "auxiliary_loss_clip": 0.01147977, + "auxiliary_loss_mlp": 0.01087824, + "balance_loss_clip": 1.03283453, + "balance_loss_mlp": 1.00642765, + "epoch": 0.3245355618349065, + "flos": 17710317216000.0, + "grad_norm": 3.522488367309251, + "language_loss": 0.78296858, + "learning_rate": 3.1570929971433897e-06, + "loss": 0.80532658, + "num_input_tokens_seen": 58109300, + "step": 2699, + "time_per_iteration": 2.6851999759674072 + }, + { + "auxiliary_loss_clip": 0.01133026, + "auxiliary_loss_mlp": 0.01088573, + "balance_loss_clip": 1.03288484, + "balance_loss_mlp": 1.007177, + "epoch": 0.3246558047255456, + "flos": 23440798316160.0, + "grad_norm": 1.7862933028191843, + "language_loss": 0.83594584, + "learning_rate": 3.1564575416351504e-06, + "loss": 0.85816187, + "num_input_tokens_seen": 58128000, + "step": 2700, + "time_per_iteration": 2.8030953407287598 + }, + { + "auxiliary_loss_clip": 0.01149792, + "auxiliary_loss_mlp": 0.01087317, + "balance_loss_clip": 1.03499854, + "balance_loss_mlp": 1.00606394, + "epoch": 0.32477604761618467, + "flos": 21760861178880.0, + "grad_norm": 2.1638837210804254, + "language_loss": 0.74167514, + "learning_rate": 3.155821910693221e-06, + "loss": 0.76404619, + "num_input_tokens_seen": 58147415, + "step": 2701, + "time_per_iteration": 2.6642537117004395 + }, + { + "auxiliary_loss_clip": 0.01129656, + "auxiliary_loss_mlp": 0.01087063, + "balance_loss_clip": 1.03216529, + "balance_loss_mlp": 1.00576186, + "epoch": 0.3248962905068238, + "flos": 19828328624640.0, + "grad_norm": 1.947311126438049, + "language_loss": 0.86248887, + "learning_rate": 3.1551861044140275e-06, + "loss": 0.88465607, + "num_input_tokens_seen": 58167050, + "step": 2702, + "time_per_iteration": 2.8248131275177 + }, + { + "auxiliary_loss_clip": 0.01110039, + "auxiliary_loss_mlp": 0.01089342, + "balance_loss_clip": 1.03078294, + "balance_loss_mlp": 1.00813627, + "epoch": 0.3250165333974629, + "flos": 23948215793280.0, + "grad_norm": 1.5871402932917051, + "language_loss": 0.77512902, + "learning_rate": 3.15455012289402e-06, + "loss": 0.79712284, + "num_input_tokens_seen": 58186695, + "step": 2703, + "time_per_iteration": 2.8297810554504395 + }, + { + "auxiliary_loss_clip": 0.0113763, + "auxiliary_loss_mlp": 0.01088312, + "balance_loss_clip": 1.03309357, + "balance_loss_mlp": 1.00691581, + "epoch": 0.32513677628810195, + "flos": 23989333887360.0, + "grad_norm": 1.6944209911701946, + "language_loss": 0.84286833, + "learning_rate": 3.153913966229677e-06, + "loss": 0.8651278, + "num_input_tokens_seen": 58205815, + "step": 2704, + "time_per_iteration": 2.7409512996673584 + }, + { + "auxiliary_loss_clip": 0.01129798, + "auxiliary_loss_mlp": 0.01080035, + "balance_loss_clip": 1.03808784, + "balance_loss_mlp": 1.00030828, + "epoch": 0.32525701917874106, + "flos": 70655790009600.0, + "grad_norm": 0.6370030459419529, + "language_loss": 0.50296271, + "learning_rate": 3.1532776345175027e-06, + "loss": 0.52506107, + "num_input_tokens_seen": 58270960, + "step": 2705, + "time_per_iteration": 3.2374987602233887 + }, + { + "auxiliary_loss_clip": 0.01148451, + "auxiliary_loss_mlp": 0.01086401, + "balance_loss_clip": 1.03399789, + "balance_loss_mlp": 1.00519562, + "epoch": 0.32537726206938017, + "flos": 19682639061120.0, + "grad_norm": 1.6885050936118382, + "language_loss": 0.78407663, + "learning_rate": 3.1526411278540285e-06, + "loss": 0.80642509, + "num_input_tokens_seen": 58289390, + "step": 2706, + "time_per_iteration": 3.5188710689544678 + }, + { + "auxiliary_loss_clip": 0.0113257, + "auxiliary_loss_mlp": 0.0108815, + "balance_loss_clip": 1.03366375, + "balance_loss_mlp": 1.00665879, + "epoch": 0.3254975049600192, + "flos": 28760999293440.0, + "grad_norm": 2.3883645056846743, + "language_loss": 0.81566107, + "learning_rate": 3.1520044463358116e-06, + "loss": 0.83786821, + "num_input_tokens_seen": 58306120, + "step": 2707, + "time_per_iteration": 2.8016698360443115 + }, + { + "auxiliary_loss_clip": 0.01138432, + "auxiliary_loss_mlp": 0.01087151, + "balance_loss_clip": 1.03345835, + "balance_loss_mlp": 1.00599349, + "epoch": 0.32561774785065833, + "flos": 18877378008960.0, + "grad_norm": 1.4954984065188373, + "language_loss": 0.80152047, + "learning_rate": 3.151367590059436e-06, + "loss": 0.8237763, + "num_input_tokens_seen": 58324545, + "step": 2708, + "time_per_iteration": 2.690120220184326 + }, + { + "auxiliary_loss_clip": 0.01149089, + "auxiliary_loss_mlp": 0.00873815, + "balance_loss_clip": 1.03429508, + "balance_loss_mlp": 1.00034952, + "epoch": 0.32573799074129745, + "flos": 23112107936640.0, + "grad_norm": 1.9198021915703578, + "language_loss": 0.86568588, + "learning_rate": 3.1507305591215117e-06, + "loss": 0.88591492, + "num_input_tokens_seen": 58342455, + "step": 2709, + "time_per_iteration": 2.727999448776245 + }, + { + "auxiliary_loss_clip": 0.01129506, + "auxiliary_loss_mlp": 0.01079869, + "balance_loss_clip": 1.03787088, + "balance_loss_mlp": 1.00014234, + "epoch": 0.3258582336319365, + "flos": 71237650423680.0, + "grad_norm": 0.6962635849395157, + "language_loss": 0.55759001, + "learning_rate": 3.150093353618677e-06, + "loss": 0.57968378, + "num_input_tokens_seen": 58407185, + "step": 2710, + "time_per_iteration": 3.3159096240997314 + }, + { + "auxiliary_loss_clip": 0.01141569, + "auxiliary_loss_mlp": 0.0108832, + "balance_loss_clip": 1.03379941, + "balance_loss_mlp": 1.00697124, + "epoch": 0.3259784765225756, + "flos": 22456020067200.0, + "grad_norm": 2.3525558515510614, + "language_loss": 0.88268906, + "learning_rate": 3.149455973647596e-06, + "loss": 0.90498787, + "num_input_tokens_seen": 58425245, + "step": 2711, + "time_per_iteration": 2.641263723373413 + }, + { + "auxiliary_loss_clip": 0.0112088, + "auxiliary_loss_mlp": 0.01088111, + "balance_loss_clip": 1.03097439, + "balance_loss_mlp": 1.00680995, + "epoch": 0.32609871941321467, + "flos": 20484811543680.0, + "grad_norm": 1.8983732672842244, + "language_loss": 0.77135432, + "learning_rate": 3.1488184193049563e-06, + "loss": 0.79344422, + "num_input_tokens_seen": 58444780, + "step": 2712, + "time_per_iteration": 3.645979642868042 + }, + { + "auxiliary_loss_clip": 0.01149983, + "auxiliary_loss_mlp": 0.01086384, + "balance_loss_clip": 1.03528678, + "balance_loss_mlp": 1.00517857, + "epoch": 0.3262189623038538, + "flos": 22416805393920.0, + "grad_norm": 1.593913393729707, + "language_loss": 0.7203846, + "learning_rate": 3.1481806906874767e-06, + "loss": 0.74274832, + "num_input_tokens_seen": 58466090, + "step": 2713, + "time_per_iteration": 3.6680521965026855 + }, + { + "auxiliary_loss_clip": 0.01148816, + "auxiliary_loss_mlp": 0.01089254, + "balance_loss_clip": 1.034042, + "balance_loss_mlp": 1.00804853, + "epoch": 0.3263392051944929, + "flos": 20923496346240.0, + "grad_norm": 1.5467766861468981, + "language_loss": 0.87578332, + "learning_rate": 3.147542787891899e-06, + "loss": 0.89816397, + "num_input_tokens_seen": 58485435, + "step": 2714, + "time_per_iteration": 2.604111433029175 + }, + { + "auxiliary_loss_clip": 0.01111563, + "auxiliary_loss_mlp": 0.01086651, + "balance_loss_clip": 1.03012884, + "balance_loss_mlp": 1.00530243, + "epoch": 0.32645944808513194, + "flos": 24025172682240.0, + "grad_norm": 1.7991305838606932, + "language_loss": 0.75044239, + "learning_rate": 3.1469047110149926e-06, + "loss": 0.77242452, + "num_input_tokens_seen": 58504175, + "step": 2715, + "time_per_iteration": 3.6717910766601562 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.01089586, + "balance_loss_clip": 1.02796245, + "balance_loss_mlp": 1.0080471, + "epoch": 0.32657969097577105, + "flos": 21032413361280.0, + "grad_norm": 1.831602660608988, + "language_loss": 0.85003996, + "learning_rate": 3.146266460153554e-06, + "loss": 0.87199497, + "num_input_tokens_seen": 58523885, + "step": 2716, + "time_per_iteration": 2.778143882751465 + }, + { + "auxiliary_loss_clip": 0.01132484, + "auxiliary_loss_mlp": 0.00873776, + "balance_loss_clip": 1.03458929, + "balance_loss_mlp": 1.00036347, + "epoch": 0.32669993386641016, + "flos": 22710267509760.0, + "grad_norm": 1.9909074913311218, + "language_loss": 0.80092627, + "learning_rate": 3.145628035404404e-06, + "loss": 0.82098889, + "num_input_tokens_seen": 58543085, + "step": 2717, + "time_per_iteration": 2.7415895462036133 + }, + { + "auxiliary_loss_clip": 0.01129619, + "auxiliary_loss_mlp": 0.01080073, + "balance_loss_clip": 1.03817391, + "balance_loss_mlp": 1.00034618, + "epoch": 0.3268201767570492, + "flos": 72105718406400.0, + "grad_norm": 0.8851954584101903, + "language_loss": 0.57505375, + "learning_rate": 3.1449894368643922e-06, + "loss": 0.59715068, + "num_input_tokens_seen": 58605400, + "step": 2718, + "time_per_iteration": 3.3270864486694336 + }, + { + "auxiliary_loss_clip": 0.01120242, + "auxiliary_loss_mlp": 0.01087372, + "balance_loss_clip": 1.03252196, + "balance_loss_mlp": 1.00616693, + "epoch": 0.32694041964768833, + "flos": 24535175938560.0, + "grad_norm": 1.5109960917072165, + "language_loss": 0.71612442, + "learning_rate": 3.1443506646303934e-06, + "loss": 0.73820066, + "num_input_tokens_seen": 58626700, + "step": 2719, + "time_per_iteration": 2.767124652862549 + }, + { + "auxiliary_loss_clip": 0.01141576, + "auxiliary_loss_mlp": 0.01086582, + "balance_loss_clip": 1.03472829, + "balance_loss_mlp": 1.00542402, + "epoch": 0.32706066253832744, + "flos": 33183003755520.0, + "grad_norm": 2.2725495263377087, + "language_loss": 0.67117172, + "learning_rate": 3.1437117187993086e-06, + "loss": 0.69345331, + "num_input_tokens_seen": 58649020, + "step": 2720, + "time_per_iteration": 2.8465070724487305 + }, + { + "auxiliary_loss_clip": 0.01115176, + "auxiliary_loss_mlp": 0.01087197, + "balance_loss_clip": 1.03112459, + "balance_loss_mlp": 1.00603902, + "epoch": 0.3271809054289665, + "flos": 24061622008320.0, + "grad_norm": 1.6164107373767318, + "language_loss": 0.79792023, + "learning_rate": 3.143072599468065e-06, + "loss": 0.81994402, + "num_input_tokens_seen": 58668845, + "step": 2721, + "time_per_iteration": 2.8970723152160645 + }, + { + "auxiliary_loss_clip": 0.01126909, + "auxiliary_loss_mlp": 0.01088865, + "balance_loss_clip": 1.03149652, + "balance_loss_mlp": 1.00751686, + "epoch": 0.3273011483196056, + "flos": 38253769712640.0, + "grad_norm": 1.4951937006963272, + "language_loss": 0.75836432, + "learning_rate": 3.1424333067336174e-06, + "loss": 0.78052211, + "num_input_tokens_seen": 58691610, + "step": 2722, + "time_per_iteration": 2.877423048019409 + }, + { + "auxiliary_loss_clip": 0.01140285, + "auxiliary_loss_mlp": 0.01088781, + "balance_loss_clip": 1.03332663, + "balance_loss_mlp": 1.00738502, + "epoch": 0.3274213912102447, + "flos": 29054389582080.0, + "grad_norm": 1.7857736463706535, + "language_loss": 0.78580397, + "learning_rate": 3.141793840692945e-06, + "loss": 0.80809462, + "num_input_tokens_seen": 58712360, + "step": 2723, + "time_per_iteration": 2.7867116928100586 + }, + { + "auxiliary_loss_clip": 0.0112835, + "auxiliary_loss_mlp": 0.0108936, + "balance_loss_clip": 1.03166223, + "balance_loss_mlp": 1.00801182, + "epoch": 0.32754163410088377, + "flos": 29133249891840.0, + "grad_norm": 2.210954800890578, + "language_loss": 0.610237, + "learning_rate": 3.1411542014430553e-06, + "loss": 0.6324141, + "num_input_tokens_seen": 58733440, + "step": 2724, + "time_per_iteration": 2.829066514968872 + }, + { + "auxiliary_loss_clip": 0.01121336, + "auxiliary_loss_mlp": 0.01087645, + "balance_loss_clip": 1.03138852, + "balance_loss_mlp": 1.00639248, + "epoch": 0.3276618769915229, + "flos": 20631075724800.0, + "grad_norm": 2.668875167862137, + "language_loss": 0.81740135, + "learning_rate": 3.1405143890809804e-06, + "loss": 0.83949113, + "num_input_tokens_seen": 58752735, + "step": 2725, + "time_per_iteration": 2.8994436264038086 + }, + { + "auxiliary_loss_clip": 0.01129698, + "auxiliary_loss_mlp": 0.0108723, + "balance_loss_clip": 1.03314972, + "balance_loss_mlp": 1.00597668, + "epoch": 0.327782119882162, + "flos": 18657425076480.0, + "grad_norm": 1.8241557678055773, + "language_loss": 0.70008671, + "learning_rate": 3.1398744037037796e-06, + "loss": 0.72225595, + "num_input_tokens_seen": 58772070, + "step": 2726, + "time_per_iteration": 2.7007253170013428 + }, + { + "auxiliary_loss_clip": 0.01126986, + "auxiliary_loss_mlp": 0.01087359, + "balance_loss_clip": 1.03143978, + "balance_loss_mlp": 1.00615335, + "epoch": 0.32790236277280105, + "flos": 21795802133760.0, + "grad_norm": 1.860992739665805, + "language_loss": 0.8390826, + "learning_rate": 3.139234245408538e-06, + "loss": 0.86122602, + "num_input_tokens_seen": 58790950, + "step": 2727, + "time_per_iteration": 2.7771124839782715 + }, + { + "auxiliary_loss_clip": 0.01121232, + "auxiliary_loss_mlp": 0.00873651, + "balance_loss_clip": 1.03346276, + "balance_loss_mlp": 1.00031281, + "epoch": 0.32802260566344016, + "flos": 23331414424320.0, + "grad_norm": 1.7184190482373856, + "language_loss": 0.75998235, + "learning_rate": 3.1385939142923666e-06, + "loss": 0.77993119, + "num_input_tokens_seen": 58813340, + "step": 2728, + "time_per_iteration": 2.8114986419677734 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01087444, + "balance_loss_clip": 1.03176129, + "balance_loss_mlp": 1.00600052, + "epoch": 0.3281428485540792, + "flos": 24206988349440.0, + "grad_norm": 1.9005710393595503, + "language_loss": 0.78268099, + "learning_rate": 3.137953410452405e-06, + "loss": 0.80484223, + "num_input_tokens_seen": 58833610, + "step": 2729, + "time_per_iteration": 2.728942632675171 + }, + { + "auxiliary_loss_clip": 0.01126815, + "auxiliary_loss_mlp": 0.01087718, + "balance_loss_clip": 1.03060782, + "balance_loss_mlp": 1.00641704, + "epoch": 0.3282630914447183, + "flos": 34128962380800.0, + "grad_norm": 1.7417818638890041, + "language_loss": 0.74165881, + "learning_rate": 3.1373127339858146e-06, + "loss": 0.7638042, + "num_input_tokens_seen": 58856210, + "step": 2730, + "time_per_iteration": 2.8583836555480957 + }, + { + "auxiliary_loss_clip": 0.0112336, + "auxiliary_loss_mlp": 0.01088922, + "balance_loss_clip": 1.03389132, + "balance_loss_mlp": 1.00766909, + "epoch": 0.32838333433535744, + "flos": 27600726170880.0, + "grad_norm": 1.9918505803122994, + "language_loss": 0.74507034, + "learning_rate": 3.136671884989787e-06, + "loss": 0.7671932, + "num_input_tokens_seen": 58876120, + "step": 2731, + "time_per_iteration": 2.831667184829712 + }, + { + "auxiliary_loss_clip": 0.01086933, + "auxiliary_loss_mlp": 0.01087268, + "balance_loss_clip": 1.03074598, + "balance_loss_mlp": 1.00620615, + "epoch": 0.3285035772259965, + "flos": 12349500935040.0, + "grad_norm": 2.3030830572206695, + "language_loss": 0.87271345, + "learning_rate": 3.1360308635615383e-06, + "loss": 0.89445549, + "num_input_tokens_seen": 58894660, + "step": 2732, + "time_per_iteration": 3.798379898071289 + }, + { + "auxiliary_loss_clip": 0.01131247, + "auxiliary_loss_mlp": 0.01087609, + "balance_loss_clip": 1.03352726, + "balance_loss_mlp": 1.00621283, + "epoch": 0.3286238201166356, + "flos": 24316084932480.0, + "grad_norm": 2.0194143492838323, + "language_loss": 0.78321457, + "learning_rate": 3.135389669798311e-06, + "loss": 0.80540305, + "num_input_tokens_seen": 58912720, + "step": 2733, + "time_per_iteration": 2.7951221466064453 + }, + { + "auxiliary_loss_clip": 0.01140056, + "auxiliary_loss_mlp": 0.00873663, + "balance_loss_clip": 1.03331161, + "balance_loss_mlp": 1.0004009, + "epoch": 0.3287440630072747, + "flos": 21392812471680.0, + "grad_norm": 1.7873524974504809, + "language_loss": 0.79746139, + "learning_rate": 3.134748303797373e-06, + "loss": 0.81759864, + "num_input_tokens_seen": 58930090, + "step": 2734, + "time_per_iteration": 2.7397396564483643 + }, + { + "auxiliary_loss_clip": 0.01097809, + "auxiliary_loss_mlp": 0.010869, + "balance_loss_clip": 1.031358, + "balance_loss_mlp": 1.005409, + "epoch": 0.32886430589791377, + "flos": 23732536579200.0, + "grad_norm": 3.314415002972861, + "language_loss": 0.80972099, + "learning_rate": 3.1341067656560203e-06, + "loss": 0.83156812, + "num_input_tokens_seen": 58947935, + "step": 2735, + "time_per_iteration": 2.8293159008026123 + }, + { + "auxiliary_loss_clip": 0.01130058, + "auxiliary_loss_mlp": 0.01088352, + "balance_loss_clip": 1.03147364, + "balance_loss_mlp": 1.00695562, + "epoch": 0.3289845487885529, + "flos": 22418708814720.0, + "grad_norm": 4.616991933736368, + "language_loss": 0.86235678, + "learning_rate": 3.133465055471572e-06, + "loss": 0.88454086, + "num_input_tokens_seen": 58967720, + "step": 2736, + "time_per_iteration": 2.7315750122070312 + }, + { + "auxiliary_loss_clip": 0.0112181, + "auxiliary_loss_mlp": 0.01087712, + "balance_loss_clip": 1.03457022, + "balance_loss_mlp": 1.00645852, + "epoch": 0.329104791679192, + "flos": 19682603147520.0, + "grad_norm": 2.408676291607936, + "language_loss": 0.66375566, + "learning_rate": 3.1328231733413767e-06, + "loss": 0.68585086, + "num_input_tokens_seen": 58984360, + "step": 2737, + "time_per_iteration": 2.765244722366333 + }, + { + "auxiliary_loss_clip": 0.01140672, + "auxiliary_loss_mlp": 0.01087176, + "balance_loss_clip": 1.03407288, + "balance_loss_mlp": 1.00582767, + "epoch": 0.32922503456983104, + "flos": 15997234803840.0, + "grad_norm": 2.0687316418262105, + "language_loss": 0.9113071, + "learning_rate": 3.1321811193628067e-06, + "loss": 0.93358558, + "num_input_tokens_seen": 59002505, + "step": 2738, + "time_per_iteration": 3.6481575965881348 + }, + { + "auxiliary_loss_clip": 0.01137764, + "auxiliary_loss_mlp": 0.00873791, + "balance_loss_clip": 1.03348196, + "balance_loss_mlp": 1.00035095, + "epoch": 0.32934527746047015, + "flos": 26834069260800.0, + "grad_norm": 4.54509795671435, + "language_loss": 0.70234299, + "learning_rate": 3.131538893633261e-06, + "loss": 0.72245848, + "num_input_tokens_seen": 59022065, + "step": 2739, + "time_per_iteration": 3.7581982612609863 + }, + { + "auxiliary_loss_clip": 0.0115064, + "auxiliary_loss_mlp": 0.01086305, + "balance_loss_clip": 1.03610182, + "balance_loss_mlp": 1.00514698, + "epoch": 0.32946552035110926, + "flos": 23403774372480.0, + "grad_norm": 2.1952394530751578, + "language_loss": 0.77656537, + "learning_rate": 3.130896496250165e-06, + "loss": 0.79893482, + "num_input_tokens_seen": 59041890, + "step": 2740, + "time_per_iteration": 2.6643965244293213 + }, + { + "auxiliary_loss_clip": 0.01148482, + "auxiliary_loss_mlp": 0.01086657, + "balance_loss_clip": 1.03339458, + "balance_loss_mlp": 1.00545144, + "epoch": 0.3295857632417483, + "flos": 14172470029440.0, + "grad_norm": 1.883976719643556, + "language_loss": 0.86686772, + "learning_rate": 3.1302539273109693e-06, + "loss": 0.88921916, + "num_input_tokens_seen": 59058715, + "step": 2741, + "time_per_iteration": 3.516953229904175 + }, + { + "auxiliary_loss_clip": 0.01130501, + "auxiliary_loss_mlp": 0.01087791, + "balance_loss_clip": 1.03285491, + "balance_loss_mlp": 1.00649071, + "epoch": 0.32970600613238743, + "flos": 22196708807040.0, + "grad_norm": 1.792960094190291, + "language_loss": 0.80332541, + "learning_rate": 3.1296111869131513e-06, + "loss": 0.82550836, + "num_input_tokens_seen": 59076140, + "step": 2742, + "time_per_iteration": 2.7719693183898926 + }, + { + "auxiliary_loss_clip": 0.01147967, + "auxiliary_loss_mlp": 0.01086695, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.00558496, + "epoch": 0.32982624902302654, + "flos": 22053784590720.0, + "grad_norm": 1.756706236008454, + "language_loss": 0.85513777, + "learning_rate": 3.1289682751542153e-06, + "loss": 0.87748444, + "num_input_tokens_seen": 59095700, + "step": 2743, + "time_per_iteration": 2.6538500785827637 + }, + { + "auxiliary_loss_clip": 0.01142046, + "auxiliary_loss_mlp": 0.01087049, + "balance_loss_clip": 1.03515947, + "balance_loss_mlp": 1.0058918, + "epoch": 0.3299464919136656, + "flos": 18661626967680.0, + "grad_norm": 1.8901029869022468, + "language_loss": 0.71560055, + "learning_rate": 3.1283251921316883e-06, + "loss": 0.7378915, + "num_input_tokens_seen": 59113445, + "step": 2744, + "time_per_iteration": 2.671840190887451 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01087608, + "balance_loss_clip": 1.0279932, + "balance_loss_mlp": 1.00640225, + "epoch": 0.3300667348043047, + "flos": 13407357404160.0, + "grad_norm": 1.967092589049114, + "language_loss": 0.80440223, + "learning_rate": 3.1276819379431277e-06, + "loss": 0.82633507, + "num_input_tokens_seen": 59131535, + "step": 2745, + "time_per_iteration": 2.7569055557250977 + }, + { + "auxiliary_loss_clip": 0.01118224, + "auxiliary_loss_mlp": 0.00873642, + "balance_loss_clip": 1.03398657, + "balance_loss_mlp": 1.00021338, + "epoch": 0.33018697769494376, + "flos": 15742556398080.0, + "grad_norm": 1.7506706433055383, + "language_loss": 0.75230598, + "learning_rate": 3.1270385126861134e-06, + "loss": 0.77222461, + "num_input_tokens_seen": 59149520, + "step": 2746, + "time_per_iteration": 2.782140016555786 + }, + { + "auxiliary_loss_clip": 0.0114928, + "auxiliary_loss_mlp": 0.0108912, + "balance_loss_clip": 1.03449607, + "balance_loss_mlp": 1.00781906, + "epoch": 0.3303072205855829, + "flos": 18258601392000.0, + "grad_norm": 1.7254532480874716, + "language_loss": 0.82036561, + "learning_rate": 3.1263949164582533e-06, + "loss": 0.84274954, + "num_input_tokens_seen": 59169170, + "step": 2747, + "time_per_iteration": 2.6685805320739746 + }, + { + "auxiliary_loss_clip": 0.01147502, + "auxiliary_loss_mlp": 0.01087906, + "balance_loss_clip": 1.03245068, + "balance_loss_mlp": 1.0065577, + "epoch": 0.330427463476222, + "flos": 17749424148480.0, + "grad_norm": 2.0799059235718262, + "language_loss": 0.78546804, + "learning_rate": 3.1257511493571797e-06, + "loss": 0.80782211, + "num_input_tokens_seen": 59187675, + "step": 2748, + "time_per_iteration": 2.6286509037017822 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_clip": 1.03124428, + "balance_loss_mlp": 1.00491023, + "epoch": 0.33054770636686104, + "flos": 27162580072320.0, + "grad_norm": 1.9264462778645604, + "language_loss": 0.78046453, + "learning_rate": 3.125107211480552e-06, + "loss": 0.80251729, + "num_input_tokens_seen": 59207610, + "step": 2749, + "time_per_iteration": 2.8439137935638428 + }, + { + "auxiliary_loss_clip": 0.01088468, + "auxiliary_loss_mlp": 0.01087841, + "balance_loss_clip": 1.03035545, + "balance_loss_mlp": 1.00668311, + "epoch": 0.33066794925750015, + "flos": 20117193799680.0, + "grad_norm": 1.5702075781231408, + "language_loss": 0.79439098, + "learning_rate": 3.124463102926054e-06, + "loss": 0.81615412, + "num_input_tokens_seen": 59226945, + "step": 2750, + "time_per_iteration": 2.8031082153320312 + }, + { + "auxiliary_loss_clip": 0.01129792, + "auxiliary_loss_mlp": 0.01080077, + "balance_loss_clip": 1.03765798, + "balance_loss_mlp": 1.00073135, + "epoch": 0.33078819214813926, + "flos": 70642609718400.0, + "grad_norm": 0.7667867845507993, + "language_loss": 0.61624128, + "learning_rate": 3.1238188237913984e-06, + "loss": 0.63834, + "num_input_tokens_seen": 59291485, + "step": 2751, + "time_per_iteration": 3.3623437881469727 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01089217, + "balance_loss_clip": 1.03463745, + "balance_loss_mlp": 1.00767827, + "epoch": 0.3309084350387783, + "flos": 21141940907520.0, + "grad_norm": 2.1205294809330764, + "language_loss": 0.7641297, + "learning_rate": 3.1231743741743202e-06, + "loss": 0.78652656, + "num_input_tokens_seen": 59310990, + "step": 2752, + "time_per_iteration": 2.617480754852295 + }, + { + "auxiliary_loss_clip": 0.01139922, + "auxiliary_loss_mlp": 0.01086567, + "balance_loss_clip": 1.03317511, + "balance_loss_mlp": 1.00531363, + "epoch": 0.3310286779294174, + "flos": 14209350318720.0, + "grad_norm": 2.5619153772530883, + "language_loss": 0.83796978, + "learning_rate": 3.122529754172582e-06, + "loss": 0.86023462, + "num_input_tokens_seen": 59327875, + "step": 2753, + "time_per_iteration": 2.6755456924438477 + }, + { + "auxiliary_loss_clip": 0.01137587, + "auxiliary_loss_mlp": 0.01088521, + "balance_loss_clip": 1.03265285, + "balance_loss_mlp": 1.00722039, + "epoch": 0.33114892082005654, + "flos": 20778130005120.0, + "grad_norm": 1.9746768741963299, + "language_loss": 0.72138214, + "learning_rate": 3.1218849638839736e-06, + "loss": 0.74364322, + "num_input_tokens_seen": 59347135, + "step": 2754, + "time_per_iteration": 2.74064564704895 + }, + { + "auxiliary_loss_clip": 0.01123758, + "auxiliary_loss_mlp": 0.01087285, + "balance_loss_clip": 1.03293145, + "balance_loss_mlp": 1.00598395, + "epoch": 0.3312691637106956, + "flos": 17090750499840.0, + "grad_norm": 5.43132579326389, + "language_loss": 0.78159112, + "learning_rate": 3.121240003406307e-06, + "loss": 0.80370158, + "num_input_tokens_seen": 59365985, + "step": 2755, + "time_per_iteration": 2.7527966499328613 + }, + { + "auxiliary_loss_clip": 0.0111974, + "auxiliary_loss_mlp": 0.01087244, + "balance_loss_clip": 1.03190732, + "balance_loss_mlp": 1.00603914, + "epoch": 0.3313894066013347, + "flos": 29456230008960.0, + "grad_norm": 2.2726930435055346, + "language_loss": 0.72571409, + "learning_rate": 3.120594872837425e-06, + "loss": 0.74778396, + "num_input_tokens_seen": 59384655, + "step": 2756, + "time_per_iteration": 2.920961856842041 + }, + { + "auxiliary_loss_clip": 0.01133119, + "auxiliary_loss_mlp": 0.00873197, + "balance_loss_clip": 1.04154932, + "balance_loss_mlp": 1.00092626, + "epoch": 0.3315096494919738, + "flos": 61419242280960.0, + "grad_norm": 0.8311438816256822, + "language_loss": 0.62417328, + "learning_rate": 3.1199495722751906e-06, + "loss": 0.64423645, + "num_input_tokens_seen": 59444185, + "step": 2757, + "time_per_iteration": 4.121062278747559 + }, + { + "auxiliary_loss_clip": 0.01111033, + "auxiliary_loss_mlp": 0.01086802, + "balance_loss_clip": 1.03095305, + "balance_loss_mlp": 1.00554931, + "epoch": 0.33162989238261287, + "flos": 21653057485440.0, + "grad_norm": 1.7461673980303005, + "language_loss": 0.83710825, + "learning_rate": 3.1193041018174972e-06, + "loss": 0.85908663, + "num_input_tokens_seen": 59464900, + "step": 2758, + "time_per_iteration": 2.840956449508667 + }, + { + "auxiliary_loss_clip": 0.0114007, + "auxiliary_loss_mlp": 0.01087264, + "balance_loss_clip": 1.03387845, + "balance_loss_mlp": 1.00596356, + "epoch": 0.331750135273252, + "flos": 22674787850880.0, + "grad_norm": 2.2502050730958243, + "language_loss": 0.94490403, + "learning_rate": 3.118658461562261e-06, + "loss": 0.96717739, + "num_input_tokens_seen": 59481000, + "step": 2759, + "time_per_iteration": 2.6849911212921143 + }, + { + "auxiliary_loss_clip": 0.01128408, + "auxiliary_loss_mlp": 0.01087213, + "balance_loss_clip": 1.03166747, + "balance_loss_mlp": 1.00591242, + "epoch": 0.33187037816389103, + "flos": 22746896403840.0, + "grad_norm": 2.1915495353787384, + "language_loss": 0.84628773, + "learning_rate": 3.118012651607426e-06, + "loss": 0.86844391, + "num_input_tokens_seen": 59502605, + "step": 2760, + "time_per_iteration": 2.7023708820343018 + }, + { + "auxiliary_loss_clip": 0.0114961, + "auxiliary_loss_mlp": 0.01086867, + "balance_loss_clip": 1.03486586, + "balance_loss_mlp": 1.00566185, + "epoch": 0.33199062105453014, + "flos": 19203769918080.0, + "grad_norm": 2.0890026534550676, + "language_loss": 0.83340454, + "learning_rate": 3.1173666720509603e-06, + "loss": 0.85576934, + "num_input_tokens_seen": 59519540, + "step": 2761, + "time_per_iteration": 2.636115074157715 + }, + { + "auxiliary_loss_clip": 0.01131934, + "auxiliary_loss_mlp": 0.01087956, + "balance_loss_clip": 1.03358257, + "balance_loss_mlp": 1.00675046, + "epoch": 0.33211086394516925, + "flos": 31577006764800.0, + "grad_norm": 2.586940113642627, + "language_loss": 0.68330663, + "learning_rate": 3.116720522990859e-06, + "loss": 0.70550555, + "num_input_tokens_seen": 59540415, + "step": 2762, + "time_per_iteration": 2.7882909774780273 + }, + { + "auxiliary_loss_clip": 0.01096229, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.00622523, + "epoch": 0.3322311068358083, + "flos": 17932496791680.0, + "grad_norm": 2.759402876985208, + "language_loss": 0.62392592, + "learning_rate": 3.116074204525142e-06, + "loss": 0.64576435, + "num_input_tokens_seen": 59558590, + "step": 2763, + "time_per_iteration": 2.8153562545776367 + }, + { + "auxiliary_loss_clip": 0.011412, + "auxiliary_loss_mlp": 0.01087411, + "balance_loss_clip": 1.03472483, + "balance_loss_mlp": 1.00615835, + "epoch": 0.3323513497264474, + "flos": 32269831269120.0, + "grad_norm": 1.5585628949458836, + "language_loss": 0.83703876, + "learning_rate": 3.1154277167518553e-06, + "loss": 0.85932487, + "num_input_tokens_seen": 59580205, + "step": 2764, + "time_per_iteration": 3.772042989730835 + }, + { + "auxiliary_loss_clip": 0.01123771, + "auxiliary_loss_mlp": 0.01079562, + "balance_loss_clip": 1.04083037, + "balance_loss_mlp": 1.00021589, + "epoch": 0.33247159261708653, + "flos": 52668674588160.0, + "grad_norm": 0.7872845517621329, + "language_loss": 0.59500134, + "learning_rate": 3.114781059769072e-06, + "loss": 0.61703461, + "num_input_tokens_seen": 59631530, + "step": 2765, + "time_per_iteration": 4.138250827789307 + }, + { + "auxiliary_loss_clip": 0.01128987, + "auxiliary_loss_mlp": 0.01087761, + "balance_loss_clip": 1.03295898, + "balance_loss_mlp": 1.00641251, + "epoch": 0.3325918355077256, + "flos": 27125232906240.0, + "grad_norm": 2.9406432524148665, + "language_loss": 0.67483807, + "learning_rate": 3.1141342336748874e-06, + "loss": 0.69700551, + "num_input_tokens_seen": 59651090, + "step": 2766, + "time_per_iteration": 3.680649995803833 + }, + { + "auxiliary_loss_clip": 0.01137472, + "auxiliary_loss_mlp": 0.01088759, + "balance_loss_clip": 1.03274012, + "balance_loss_mlp": 1.00760102, + "epoch": 0.3327120783983647, + "flos": 23664414435840.0, + "grad_norm": 1.5613368247185646, + "language_loss": 0.82124734, + "learning_rate": 3.1134872385674253e-06, + "loss": 0.84350967, + "num_input_tokens_seen": 59675245, + "step": 2767, + "time_per_iteration": 2.8345749378204346 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01086166, + "balance_loss_clip": 1.03236485, + "balance_loss_mlp": 1.00505602, + "epoch": 0.3328323212890038, + "flos": 19171378828800.0, + "grad_norm": 1.7816270686532325, + "language_loss": 0.85273671, + "learning_rate": 3.1128400745448353e-06, + "loss": 0.87491047, + "num_input_tokens_seen": 59694625, + "step": 2768, + "time_per_iteration": 2.8327815532684326 + }, + { + "auxiliary_loss_clip": 0.01140381, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_clip": 1.03417683, + "balance_loss_mlp": 1.00644076, + "epoch": 0.33295256417964286, + "flos": 37706347463040.0, + "grad_norm": 2.7938792802783845, + "language_loss": 0.63053584, + "learning_rate": 3.11219274170529e-06, + "loss": 0.65281701, + "num_input_tokens_seen": 59716435, + "step": 2769, + "time_per_iteration": 2.8349883556365967 + }, + { + "auxiliary_loss_clip": 0.01129185, + "auxiliary_loss_mlp": 0.0108669, + "balance_loss_clip": 1.0319165, + "balance_loss_mlp": 1.00567532, + "epoch": 0.333072807070282, + "flos": 26505989412480.0, + "grad_norm": 1.7460653558648183, + "language_loss": 0.81993687, + "learning_rate": 3.1115452401469903e-06, + "loss": 0.84209561, + "num_input_tokens_seen": 59736835, + "step": 2770, + "time_per_iteration": 2.7756295204162598 + }, + { + "auxiliary_loss_clip": 0.01113619, + "auxiliary_loss_mlp": 0.01088289, + "balance_loss_clip": 1.03311372, + "balance_loss_mlp": 1.00722623, + "epoch": 0.3331930499609211, + "flos": 21430913823360.0, + "grad_norm": 2.012961663115085, + "language_loss": 0.86401594, + "learning_rate": 3.1108975699681613e-06, + "loss": 0.88603508, + "num_input_tokens_seen": 59754230, + "step": 2771, + "time_per_iteration": 2.8264269828796387 + }, + { + "auxiliary_loss_clip": 0.01119823, + "auxiliary_loss_mlp": 0.01087495, + "balance_loss_clip": 1.03111243, + "balance_loss_mlp": 1.00652814, + "epoch": 0.33331329285156014, + "flos": 20659947281280.0, + "grad_norm": 2.0741712159478576, + "language_loss": 0.71563959, + "learning_rate": 3.1102497312670542e-06, + "loss": 0.73771274, + "num_input_tokens_seen": 59772235, + "step": 2772, + "time_per_iteration": 2.792506456375122 + }, + { + "auxiliary_loss_clip": 0.01129323, + "auxiliary_loss_mlp": 0.01086679, + "balance_loss_clip": 1.03223252, + "balance_loss_mlp": 1.00561643, + "epoch": 0.33343353574219925, + "flos": 28001596930560.0, + "grad_norm": 2.333316566951966, + "language_loss": 0.80345982, + "learning_rate": 3.109601724141946e-06, + "loss": 0.82561982, + "num_input_tokens_seen": 59791230, + "step": 2773, + "time_per_iteration": 2.7669668197631836 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01086696, + "balance_loss_clip": 1.03430748, + "balance_loss_mlp": 1.00544333, + "epoch": 0.33355377863283836, + "flos": 23764963582080.0, + "grad_norm": 2.0580458545227778, + "language_loss": 0.6818881, + "learning_rate": 3.108953548691138e-06, + "loss": 0.70394665, + "num_input_tokens_seen": 59811315, + "step": 2774, + "time_per_iteration": 2.722280740737915 + }, + { + "auxiliary_loss_clip": 0.01151454, + "auxiliary_loss_mlp": 0.01086915, + "balance_loss_clip": 1.03700769, + "balance_loss_mlp": 1.00580478, + "epoch": 0.3336740215234774, + "flos": 37779677078400.0, + "grad_norm": 2.8667884841362348, + "language_loss": 0.72412544, + "learning_rate": 3.108305205012959e-06, + "loss": 0.74650913, + "num_input_tokens_seen": 59832010, + "step": 2775, + "time_per_iteration": 2.8403680324554443 + }, + { + "auxiliary_loss_clip": 0.01129434, + "auxiliary_loss_mlp": 0.01087695, + "balance_loss_clip": 1.03253222, + "balance_loss_mlp": 1.0064894, + "epoch": 0.3337942644141165, + "flos": 25519056347520.0, + "grad_norm": 2.104990147699783, + "language_loss": 0.87755758, + "learning_rate": 3.107656693205761e-06, + "loss": 0.89972889, + "num_input_tokens_seen": 59851450, + "step": 2776, + "time_per_iteration": 2.7877392768859863 + }, + { + "auxiliary_loss_clip": 0.01150632, + "auxiliary_loss_mlp": 0.01088492, + "balance_loss_clip": 1.03529906, + "balance_loss_mlp": 1.00704861, + "epoch": 0.3339145073047556, + "flos": 25989844930560.0, + "grad_norm": 2.5683846831386163, + "language_loss": 0.70290786, + "learning_rate": 3.107008013367924e-06, + "loss": 0.72529912, + "num_input_tokens_seen": 59870245, + "step": 2777, + "time_per_iteration": 2.6537489891052246 + }, + { + "auxiliary_loss_clip": 0.0111818, + "auxiliary_loss_mlp": 0.0108757, + "balance_loss_clip": 1.03077805, + "balance_loss_mlp": 1.00646043, + "epoch": 0.3340347501953947, + "flos": 19062569554560.0, + "grad_norm": 2.0988928467691377, + "language_loss": 0.86601388, + "learning_rate": 3.1063591655978507e-06, + "loss": 0.88807142, + "num_input_tokens_seen": 59886195, + "step": 2778, + "time_per_iteration": 2.7817442417144775 + }, + { + "auxiliary_loss_clip": 0.01096895, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_clip": 1.03070903, + "balance_loss_mlp": 1.00630653, + "epoch": 0.3341549930860338, + "flos": 18109715518080.0, + "grad_norm": 1.8633558311544731, + "language_loss": 0.79585028, + "learning_rate": 3.105710149993972e-06, + "loss": 0.81769383, + "num_input_tokens_seen": 59905525, + "step": 2779, + "time_per_iteration": 2.7673988342285156 + }, + { + "auxiliary_loss_clip": 0.01149439, + "auxiliary_loss_mlp": 0.01087614, + "balance_loss_clip": 1.034024, + "balance_loss_mlp": 1.00650358, + "epoch": 0.33427523597667286, + "flos": 22674967418880.0, + "grad_norm": 1.958500549835585, + "language_loss": 0.85068274, + "learning_rate": 3.1050609666547427e-06, + "loss": 0.87305331, + "num_input_tokens_seen": 59925085, + "step": 2780, + "time_per_iteration": 2.6451103687286377 + }, + { + "auxiliary_loss_clip": 0.01122645, + "auxiliary_loss_mlp": 0.01087508, + "balance_loss_clip": 1.03347921, + "balance_loss_mlp": 1.00630307, + "epoch": 0.33439547886731197, + "flos": 22638338524800.0, + "grad_norm": 1.9228306614878727, + "language_loss": 0.77188671, + "learning_rate": 3.104411615678644e-06, + "loss": 0.79398823, + "num_input_tokens_seen": 59943935, + "step": 2781, + "time_per_iteration": 2.7502474784851074 + }, + { + "auxiliary_loss_clip": 0.01125581, + "auxiliary_loss_mlp": 0.01088295, + "balance_loss_clip": 1.02997816, + "balance_loss_mlp": 1.00685155, + "epoch": 0.3345157217579511, + "flos": 24096383395200.0, + "grad_norm": 2.4986205301466793, + "language_loss": 0.73498797, + "learning_rate": 3.1037620971641803e-06, + "loss": 0.75712675, + "num_input_tokens_seen": 59963725, + "step": 2782, + "time_per_iteration": 2.7444255352020264 + }, + { + "auxiliary_loss_clip": 0.01148637, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_clip": 1.03426695, + "balance_loss_mlp": 1.00654328, + "epoch": 0.33463596464859013, + "flos": 18989491334400.0, + "grad_norm": 6.739397392771852, + "language_loss": 0.64350605, + "learning_rate": 3.1031124112098844e-06, + "loss": 0.66587043, + "num_input_tokens_seen": 59981935, + "step": 2783, + "time_per_iteration": 3.6260628700256348 + }, + { + "auxiliary_loss_clip": 0.01129934, + "auxiliary_loss_mlp": 0.01086109, + "balance_loss_clip": 1.03319454, + "balance_loss_mlp": 1.00485635, + "epoch": 0.33475620753922924, + "flos": 20375607219840.0, + "grad_norm": 2.196056880547689, + "language_loss": 0.72119081, + "learning_rate": 3.1024625579143127e-06, + "loss": 0.74335122, + "num_input_tokens_seen": 59999455, + "step": 2784, + "time_per_iteration": 2.696115732192993 + }, + { + "auxiliary_loss_clip": 0.01149181, + "auxiliary_loss_mlp": 0.01087151, + "balance_loss_clip": 1.03454673, + "balance_loss_mlp": 1.00608885, + "epoch": 0.33487645042986836, + "flos": 18182578256640.0, + "grad_norm": 1.9392293443226087, + "language_loss": 0.72954047, + "learning_rate": 3.101812537376048e-06, + "loss": 0.75190383, + "num_input_tokens_seen": 60018475, + "step": 2785, + "time_per_iteration": 2.809934139251709 + }, + { + "auxiliary_loss_clip": 0.01129039, + "auxiliary_loss_mlp": 0.00873436, + "balance_loss_clip": 1.03265953, + "balance_loss_mlp": 1.00019681, + "epoch": 0.3349966933205074, + "flos": 25848824135040.0, + "grad_norm": 1.9237839557313456, + "language_loss": 0.84325039, + "learning_rate": 3.1011623496936973e-06, + "loss": 0.86327511, + "num_input_tokens_seen": 60036770, + "step": 2786, + "time_per_iteration": 2.792163372039795 + }, + { + "auxiliary_loss_clip": 0.01148472, + "auxiliary_loss_mlp": 0.01087471, + "balance_loss_clip": 1.03445029, + "balance_loss_mlp": 1.00636113, + "epoch": 0.3351169362111465, + "flos": 28111447699200.0, + "grad_norm": 2.693121889468736, + "language_loss": 0.69785744, + "learning_rate": 3.100511994965893e-06, + "loss": 0.72021687, + "num_input_tokens_seen": 60056725, + "step": 2787, + "time_per_iteration": 2.685612916946411 + }, + { + "auxiliary_loss_clip": 0.0113077, + "auxiliary_loss_mlp": 0.01086664, + "balance_loss_clip": 1.03152418, + "balance_loss_mlp": 1.00550663, + "epoch": 0.33523717910178563, + "flos": 22673315393280.0, + "grad_norm": 1.7917285446411368, + "language_loss": 0.84401226, + "learning_rate": 3.0998614732912947e-06, + "loss": 0.86618662, + "num_input_tokens_seen": 60076100, + "step": 2788, + "time_per_iteration": 2.6916558742523193 + }, + { + "auxiliary_loss_clip": 0.01138402, + "auxiliary_loss_mlp": 0.01088335, + "balance_loss_clip": 1.03398669, + "balance_loss_mlp": 1.00722528, + "epoch": 0.3353574219924247, + "flos": 15669801400320.0, + "grad_norm": 1.8833626903374154, + "language_loss": 0.67479485, + "learning_rate": 3.0992107847685855e-06, + "loss": 0.69706225, + "num_input_tokens_seen": 60093815, + "step": 2789, + "time_per_iteration": 3.5590500831604004 + }, + { + "auxiliary_loss_clip": 0.01128674, + "auxiliary_loss_mlp": 0.01088354, + "balance_loss_clip": 1.03281069, + "balance_loss_mlp": 1.00705302, + "epoch": 0.3354776648830638, + "flos": 24790644443520.0, + "grad_norm": 3.2470255632479796, + "language_loss": 0.79324561, + "learning_rate": 3.0985599294964736e-06, + "loss": 0.81541586, + "num_input_tokens_seen": 60113370, + "step": 2790, + "time_per_iteration": 3.683647394180298 + }, + { + "auxiliary_loss_clip": 0.01109031, + "auxiliary_loss_mlp": 0.01089357, + "balance_loss_clip": 1.03247595, + "balance_loss_mlp": 1.00786543, + "epoch": 0.33559790777370285, + "flos": 28694852398080.0, + "grad_norm": 2.2109892209733673, + "language_loss": 0.69547153, + "learning_rate": 3.097908907573695e-06, + "loss": 0.71745539, + "num_input_tokens_seen": 60131350, + "step": 2791, + "time_per_iteration": 2.764702320098877 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01086041, + "balance_loss_clip": 1.02955413, + "balance_loss_mlp": 1.0049305, + "epoch": 0.33571815066434196, + "flos": 22235779825920.0, + "grad_norm": 2.4634104976220894, + "language_loss": 0.89683199, + "learning_rate": 3.0972577190990067e-06, + "loss": 0.91867113, + "num_input_tokens_seen": 60149830, + "step": 2792, + "time_per_iteration": 3.7207462787628174 + }, + { + "auxiliary_loss_clip": 0.01123333, + "auxiliary_loss_mlp": 0.0108568, + "balance_loss_clip": 1.03381228, + "balance_loss_mlp": 1.00466526, + "epoch": 0.3358383935549811, + "flos": 23842279607040.0, + "grad_norm": 2.0199399865427425, + "language_loss": 0.79933727, + "learning_rate": 3.096606364171196e-06, + "loss": 0.82142735, + "num_input_tokens_seen": 60169620, + "step": 2793, + "time_per_iteration": 2.806013345718384 + }, + { + "auxiliary_loss_clip": 0.01115369, + "auxiliary_loss_mlp": 0.01090131, + "balance_loss_clip": 1.03153777, + "balance_loss_mlp": 1.00883055, + "epoch": 0.33595863644562013, + "flos": 22267308988800.0, + "grad_norm": 2.7414953743704267, + "language_loss": 0.8493374, + "learning_rate": 3.0959548428890703e-06, + "loss": 0.87139237, + "num_input_tokens_seen": 60188490, + "step": 2794, + "time_per_iteration": 2.7644217014312744 + }, + { + "auxiliary_loss_clip": 0.01137834, + "auxiliary_loss_mlp": 0.01085583, + "balance_loss_clip": 1.03440034, + "balance_loss_mlp": 1.00437784, + "epoch": 0.33607887933625924, + "flos": 20119779578880.0, + "grad_norm": 1.8502944608391745, + "language_loss": 0.84124112, + "learning_rate": 3.095303155351468e-06, + "loss": 0.86347532, + "num_input_tokens_seen": 60208695, + "step": 2795, + "time_per_iteration": 2.791809558868408 + }, + { + "auxiliary_loss_clip": 0.01110146, + "auxiliary_loss_mlp": 0.01085906, + "balance_loss_clip": 1.03059292, + "balance_loss_mlp": 1.00493932, + "epoch": 0.33619912222689835, + "flos": 19318109886720.0, + "grad_norm": 6.248451666730232, + "language_loss": 0.79286528, + "learning_rate": 3.0946513016572464e-06, + "loss": 0.81482577, + "num_input_tokens_seen": 60227600, + "step": 2796, + "time_per_iteration": 2.734626293182373 + }, + { + "auxiliary_loss_clip": 0.01136808, + "auxiliary_loss_mlp": 0.01089871, + "balance_loss_clip": 1.03140759, + "balance_loss_mlp": 1.00852227, + "epoch": 0.3363193651175374, + "flos": 16800664262400.0, + "grad_norm": 2.129606832516451, + "language_loss": 0.76780492, + "learning_rate": 3.0939992819052938e-06, + "loss": 0.79007173, + "num_input_tokens_seen": 60245110, + "step": 2797, + "time_per_iteration": 2.736175537109375 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01086898, + "balance_loss_clip": 1.03388977, + "balance_loss_mlp": 1.00578761, + "epoch": 0.3364396080081765, + "flos": 23550289948800.0, + "grad_norm": 2.2467797245648993, + "language_loss": 0.80955052, + "learning_rate": 3.0933470961945193e-06, + "loss": 0.83172584, + "num_input_tokens_seen": 60263405, + "step": 2798, + "time_per_iteration": 2.7514591217041016 + }, + { + "auxiliary_loss_clip": 0.01129018, + "auxiliary_loss_mlp": 0.01086427, + "balance_loss_clip": 1.03338528, + "balance_loss_mlp": 1.00536418, + "epoch": 0.3365598508988156, + "flos": 28037902602240.0, + "grad_norm": 10.789359714609665, + "language_loss": 0.68122578, + "learning_rate": 3.0926947446238597e-06, + "loss": 0.70338023, + "num_input_tokens_seen": 60282975, + "step": 2799, + "time_per_iteration": 2.740044116973877 + }, + { + "auxiliary_loss_clip": 0.01139775, + "auxiliary_loss_mlp": 0.01087338, + "balance_loss_clip": 1.03262329, + "balance_loss_mlp": 1.00603771, + "epoch": 0.3366800937894547, + "flos": 16982767238400.0, + "grad_norm": 2.777268506963139, + "language_loss": 0.82482338, + "learning_rate": 3.092042227292276e-06, + "loss": 0.84709448, + "num_input_tokens_seen": 60299810, + "step": 2800, + "time_per_iteration": 2.703795909881592 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01086929, + "balance_loss_clip": 1.03384387, + "balance_loss_mlp": 1.00610554, + "epoch": 0.3368003366800938, + "flos": 23915321913600.0, + "grad_norm": 1.8385594887444858, + "language_loss": 0.88124597, + "learning_rate": 3.0913895442987557e-06, + "loss": 0.90359908, + "num_input_tokens_seen": 60320775, + "step": 2801, + "time_per_iteration": 2.6634883880615234 + }, + { + "auxiliary_loss_clip": 0.01119192, + "auxiliary_loss_mlp": 0.00873578, + "balance_loss_clip": 1.0321877, + "balance_loss_mlp": 1.00031495, + "epoch": 0.3369205795707329, + "flos": 24791219061120.0, + "grad_norm": 1.655712067665722, + "language_loss": 0.85612041, + "learning_rate": 3.090736695742308e-06, + "loss": 0.87604809, + "num_input_tokens_seen": 60341905, + "step": 2802, + "time_per_iteration": 2.8089940547943115 + }, + { + "auxiliary_loss_clip": 0.01108678, + "auxiliary_loss_mlp": 0.01085474, + "balance_loss_clip": 1.0286423, + "balance_loss_mlp": 1.00445914, + "epoch": 0.33704082246137196, + "flos": 17931096161280.0, + "grad_norm": 3.57682961088078, + "language_loss": 0.51907116, + "learning_rate": 3.0900836817219713e-06, + "loss": 0.54101264, + "num_input_tokens_seen": 60358335, + "step": 2803, + "time_per_iteration": 2.778717279434204 + }, + { + "auxiliary_loss_clip": 0.01148684, + "auxiliary_loss_mlp": 0.01087314, + "balance_loss_clip": 1.03411877, + "balance_loss_mlp": 1.00620413, + "epoch": 0.33716106535201107, + "flos": 21286517149440.0, + "grad_norm": 7.3179286700561645, + "language_loss": 0.83758712, + "learning_rate": 3.089430502336807e-06, + "loss": 0.85994714, + "num_input_tokens_seen": 60378305, + "step": 2804, + "time_per_iteration": 2.6937754154205322 + }, + { + "auxiliary_loss_clip": 0.01141257, + "auxiliary_loss_mlp": 0.01086125, + "balance_loss_clip": 1.0344913, + "balance_loss_mlp": 1.00491953, + "epoch": 0.3372813082426502, + "flos": 18402962152320.0, + "grad_norm": 2.5573214348839017, + "language_loss": 0.89942694, + "learning_rate": 3.088777157685902e-06, + "loss": 0.92170072, + "num_input_tokens_seen": 60393895, + "step": 2805, + "time_per_iteration": 2.631260633468628 + }, + { + "auxiliary_loss_clip": 0.01125914, + "auxiliary_loss_mlp": 0.01085286, + "balance_loss_clip": 1.02999282, + "balance_loss_mlp": 1.00412798, + "epoch": 0.33740155113328923, + "flos": 17201391367680.0, + "grad_norm": 2.5228900229129834, + "language_loss": 0.85833842, + "learning_rate": 3.088123647868367e-06, + "loss": 0.88045037, + "num_input_tokens_seen": 60410445, + "step": 2806, + "time_per_iteration": 2.7342782020568848 + }, + { + "auxiliary_loss_clip": 0.01141875, + "auxiliary_loss_mlp": 0.01087416, + "balance_loss_clip": 1.03468323, + "balance_loss_mlp": 1.00616288, + "epoch": 0.33752179402392835, + "flos": 29058950609280.0, + "grad_norm": 2.3583696789838253, + "language_loss": 0.81086206, + "learning_rate": 3.0874699729833405e-06, + "loss": 0.83315492, + "num_input_tokens_seen": 60431815, + "step": 2807, + "time_per_iteration": 2.8412489891052246 + }, + { + "auxiliary_loss_clip": 0.01120533, + "auxiliary_loss_mlp": 0.01086358, + "balance_loss_clip": 1.03001475, + "balance_loss_mlp": 1.00529623, + "epoch": 0.3376420369145674, + "flos": 25080730680960.0, + "grad_norm": 1.645699195972463, + "language_loss": 0.79717928, + "learning_rate": 3.086816133129983e-06, + "loss": 0.8192482, + "num_input_tokens_seen": 60452075, + "step": 2808, + "time_per_iteration": 2.7151527404785156 + }, + { + "auxiliary_loss_clip": 0.01152581, + "auxiliary_loss_mlp": 0.01087112, + "balance_loss_clip": 1.03778577, + "balance_loss_mlp": 1.00590706, + "epoch": 0.3377622798052065, + "flos": 27490624007040.0, + "grad_norm": 1.90774072457387, + "language_loss": 0.76091814, + "learning_rate": 3.0861621284074826e-06, + "loss": 0.78331506, + "num_input_tokens_seen": 60472600, + "step": 2809, + "time_per_iteration": 3.6253550052642822 + }, + { + "auxiliary_loss_clip": 0.01121161, + "auxiliary_loss_mlp": 0.01086868, + "balance_loss_clip": 1.03682351, + "balance_loss_mlp": 1.00575781, + "epoch": 0.3378825226958456, + "flos": 21975211589760.0, + "grad_norm": 1.6594078301202473, + "language_loss": 0.7309252, + "learning_rate": 3.085507958915051e-06, + "loss": 0.7530055, + "num_input_tokens_seen": 60491030, + "step": 2810, + "time_per_iteration": 2.7157514095306396 + }, + { + "auxiliary_loss_clip": 0.01126162, + "auxiliary_loss_mlp": 0.01086574, + "balance_loss_clip": 1.03049445, + "balance_loss_mlp": 1.00532055, + "epoch": 0.3380027655864847, + "flos": 42523189200000.0, + "grad_norm": 2.3137832241487564, + "language_loss": 0.71110666, + "learning_rate": 3.084853624751925e-06, + "loss": 0.73323405, + "num_input_tokens_seen": 60512615, + "step": 2811, + "time_per_iteration": 2.9477269649505615 + }, + { + "auxiliary_loss_clip": 0.0112141, + "auxiliary_loss_mlp": 0.01087702, + "balance_loss_clip": 1.03406227, + "balance_loss_mlp": 1.00654459, + "epoch": 0.3381230084771238, + "flos": 26725080418560.0, + "grad_norm": 2.744458599112193, + "language_loss": 0.85905552, + "learning_rate": 3.0841991260173668e-06, + "loss": 0.88114667, + "num_input_tokens_seen": 60532520, + "step": 2812, + "time_per_iteration": 2.77441668510437 + }, + { + "auxiliary_loss_clip": 0.01150441, + "auxiliary_loss_mlp": 0.01087996, + "balance_loss_clip": 1.03556061, + "balance_loss_mlp": 1.00679064, + "epoch": 0.3382432513677629, + "flos": 22710375250560.0, + "grad_norm": 2.2167104265737154, + "language_loss": 0.80564058, + "learning_rate": 3.0835444628106634e-06, + "loss": 0.82802498, + "num_input_tokens_seen": 60551500, + "step": 2813, + "time_per_iteration": 2.715189218521118 + }, + { + "auxiliary_loss_clip": 0.01149205, + "auxiliary_loss_mlp": 0.00873655, + "balance_loss_clip": 1.03461885, + "balance_loss_mlp": 1.00033653, + "epoch": 0.33836349425840195, + "flos": 22122409524480.0, + "grad_norm": 1.7674098387154467, + "language_loss": 0.83078802, + "learning_rate": 3.082889635231126e-06, + "loss": 0.85101664, + "num_input_tokens_seen": 60570160, + "step": 2814, + "time_per_iteration": 2.65031099319458 + }, + { + "auxiliary_loss_clip": 0.01131138, + "auxiliary_loss_mlp": 0.01086178, + "balance_loss_clip": 1.03372943, + "balance_loss_mlp": 1.00502014, + "epoch": 0.33848373714904106, + "flos": 27308090067840.0, + "grad_norm": 3.619994180532077, + "language_loss": 0.76887757, + "learning_rate": 3.0822346433780925e-06, + "loss": 0.79105079, + "num_input_tokens_seen": 60590885, + "step": 2815, + "time_per_iteration": 3.7021396160125732 + }, + { + "auxiliary_loss_clip": 0.01140733, + "auxiliary_loss_mlp": 0.01086401, + "balance_loss_clip": 1.03368771, + "balance_loss_mlp": 1.00519586, + "epoch": 0.3386039800396802, + "flos": 25848716394240.0, + "grad_norm": 3.391696073256515, + "language_loss": 0.87256193, + "learning_rate": 3.0815794873509237e-06, + "loss": 0.89483321, + "num_input_tokens_seen": 60609170, + "step": 2816, + "time_per_iteration": 3.715733051300049 + }, + { + "auxiliary_loss_clip": 0.01148588, + "auxiliary_loss_mlp": 0.01086427, + "balance_loss_clip": 1.03388774, + "balance_loss_mlp": 1.00536442, + "epoch": 0.33872422293031923, + "flos": 18880646146560.0, + "grad_norm": 2.0481582556856477, + "language_loss": 0.72614622, + "learning_rate": 3.0809241672490066e-06, + "loss": 0.74849641, + "num_input_tokens_seen": 60627340, + "step": 2817, + "time_per_iteration": 2.628390312194824 + }, + { + "auxiliary_loss_clip": 0.01130715, + "auxiliary_loss_mlp": 0.01086437, + "balance_loss_clip": 1.03300858, + "balance_loss_mlp": 1.00537443, + "epoch": 0.33884446582095834, + "flos": 23146977064320.0, + "grad_norm": 1.7894201833236933, + "language_loss": 0.84742951, + "learning_rate": 3.080268683171753e-06, + "loss": 0.86960101, + "num_input_tokens_seen": 60647630, + "step": 2818, + "time_per_iteration": 3.6553077697753906 + }, + { + "auxiliary_loss_clip": 0.01140848, + "auxiliary_loss_mlp": 0.01087363, + "balance_loss_clip": 1.03429484, + "balance_loss_mlp": 1.00620532, + "epoch": 0.33896470871159745, + "flos": 15997342544640.0, + "grad_norm": 3.3051082700698164, + "language_loss": 0.89471865, + "learning_rate": 3.0796130352185985e-06, + "loss": 0.91700071, + "num_input_tokens_seen": 60664485, + "step": 2819, + "time_per_iteration": 2.74072265625 + }, + { + "auxiliary_loss_clip": 0.01132072, + "auxiliary_loss_mlp": 0.00873689, + "balance_loss_clip": 1.03319764, + "balance_loss_mlp": 1.00034142, + "epoch": 0.3390849516022365, + "flos": 34495754112000.0, + "grad_norm": 1.8848189240130124, + "language_loss": 0.66354203, + "learning_rate": 3.0789572234890057e-06, + "loss": 0.68359959, + "num_input_tokens_seen": 60686125, + "step": 2820, + "time_per_iteration": 2.8645973205566406 + }, + { + "auxiliary_loss_clip": 0.0112943, + "auxiliary_loss_mlp": 0.01088458, + "balance_loss_clip": 1.03333879, + "balance_loss_mlp": 1.00739598, + "epoch": 0.3392051944928756, + "flos": 16180307447040.0, + "grad_norm": 2.1339143559060134, + "language_loss": 0.77106833, + "learning_rate": 3.0783012480824596e-06, + "loss": 0.79324716, + "num_input_tokens_seen": 60705270, + "step": 2821, + "time_per_iteration": 2.7165369987487793 + }, + { + "auxiliary_loss_clip": 0.01150244, + "auxiliary_loss_mlp": 0.01086922, + "balance_loss_clip": 1.03560841, + "balance_loss_mlp": 1.00571644, + "epoch": 0.33932543738351467, + "flos": 17086656349440.0, + "grad_norm": 2.249437581069339, + "language_loss": 0.74264657, + "learning_rate": 3.077645109098471e-06, + "loss": 0.76501834, + "num_input_tokens_seen": 60721540, + "step": 2822, + "time_per_iteration": 2.5773415565490723 + }, + { + "auxiliary_loss_clip": 0.01122171, + "auxiliary_loss_mlp": 0.01088225, + "balance_loss_clip": 1.03433013, + "balance_loss_mlp": 1.00711513, + "epoch": 0.3394456802741538, + "flos": 22126970551680.0, + "grad_norm": 1.7698880692054717, + "language_loss": 0.71838152, + "learning_rate": 3.076988806636577e-06, + "loss": 0.74048555, + "num_input_tokens_seen": 60739300, + "step": 2823, + "time_per_iteration": 2.777968406677246 + }, + { + "auxiliary_loss_clip": 0.01132054, + "auxiliary_loss_mlp": 0.00873669, + "balance_loss_clip": 1.03500915, + "balance_loss_mlp": 1.00031447, + "epoch": 0.3395659231647929, + "flos": 25226887121280.0, + "grad_norm": 1.919960026028024, + "language_loss": 0.88960075, + "learning_rate": 3.0763323407963377e-06, + "loss": 0.90965801, + "num_input_tokens_seen": 60758910, + "step": 2824, + "time_per_iteration": 2.7286417484283447 + }, + { + "auxiliary_loss_clip": 0.01142878, + "auxiliary_loss_mlp": 0.01087925, + "balance_loss_clip": 1.0359714, + "balance_loss_mlp": 1.00676703, + "epoch": 0.33968616605543195, + "flos": 29096477343360.0, + "grad_norm": 2.495046057066085, + "language_loss": 0.7965942, + "learning_rate": 3.075675711677337e-06, + "loss": 0.81890225, + "num_input_tokens_seen": 60779005, + "step": 2825, + "time_per_iteration": 2.7635810375213623 + }, + { + "auxiliary_loss_clip": 0.01121516, + "auxiliary_loss_mlp": 0.01086721, + "balance_loss_clip": 1.03177643, + "balance_loss_mlp": 1.0058496, + "epoch": 0.33980640894607106, + "flos": 21433966479360.0, + "grad_norm": 1.9921297880935669, + "language_loss": 0.77940518, + "learning_rate": 3.0750189193791865e-06, + "loss": 0.80148751, + "num_input_tokens_seen": 60798590, + "step": 2826, + "time_per_iteration": 2.6722018718719482 + }, + { + "auxiliary_loss_clip": 0.01143284, + "auxiliary_loss_mlp": 0.01087349, + "balance_loss_clip": 1.03665781, + "balance_loss_mlp": 1.00623846, + "epoch": 0.33992665183671017, + "flos": 32490035596800.0, + "grad_norm": 1.890201972086583, + "language_loss": 0.69928801, + "learning_rate": 3.0743619640015203e-06, + "loss": 0.72159433, + "num_input_tokens_seen": 60818840, + "step": 2827, + "time_per_iteration": 2.7473037242889404 + }, + { + "auxiliary_loss_clip": 0.01134116, + "auxiliary_loss_mlp": 0.01086757, + "balance_loss_clip": 1.03525627, + "balance_loss_mlp": 1.00578976, + "epoch": 0.3400468947273492, + "flos": 17055414495360.0, + "grad_norm": 2.050604996695987, + "language_loss": 0.92496467, + "learning_rate": 3.073704845643999e-06, + "loss": 0.94717342, + "num_input_tokens_seen": 60835965, + "step": 2828, + "time_per_iteration": 2.70701003074646 + }, + { + "auxiliary_loss_clip": 0.01142379, + "auxiliary_loss_mlp": 0.01088636, + "balance_loss_clip": 1.03568637, + "balance_loss_mlp": 1.00728738, + "epoch": 0.34016713761798834, + "flos": 16872988296960.0, + "grad_norm": 3.6641873544723738, + "language_loss": 0.77167177, + "learning_rate": 3.0730475644063063e-06, + "loss": 0.79398191, + "num_input_tokens_seen": 60851065, + "step": 2829, + "time_per_iteration": 2.5880613327026367 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.00873458, + "balance_loss_clip": 1.03495884, + "balance_loss_mlp": 1.00036883, + "epoch": 0.34028738050862745, + "flos": 21907161273600.0, + "grad_norm": 1.7406912537794397, + "language_loss": 0.64823097, + "learning_rate": 3.072390120388151e-06, + "loss": 0.66815507, + "num_input_tokens_seen": 60869390, + "step": 2830, + "time_per_iteration": 2.736367702484131 + }, + { + "auxiliary_loss_clip": 0.01139158, + "auxiliary_loss_mlp": 0.01087274, + "balance_loss_clip": 1.03435421, + "balance_loss_mlp": 1.0059731, + "epoch": 0.3404076233992665, + "flos": 22746034477440.0, + "grad_norm": 2.08252214789104, + "language_loss": 0.71217579, + "learning_rate": 3.071732513689267e-06, + "loss": 0.73444009, + "num_input_tokens_seen": 60887925, + "step": 2831, + "time_per_iteration": 2.7075352668762207 + }, + { + "auxiliary_loss_clip": 0.01135983, + "auxiliary_loss_mlp": 0.01087502, + "balance_loss_clip": 1.03243995, + "balance_loss_mlp": 1.00634408, + "epoch": 0.3405278662899056, + "flos": 17052361839360.0, + "grad_norm": 2.4224362182590338, + "language_loss": 0.67608523, + "learning_rate": 3.0710747444094134e-06, + "loss": 0.69832003, + "num_input_tokens_seen": 60905955, + "step": 2832, + "time_per_iteration": 2.646196126937866 + }, + { + "auxiliary_loss_clip": 0.01131224, + "auxiliary_loss_mlp": 0.01086977, + "balance_loss_clip": 1.03467178, + "balance_loss_mlp": 1.005867, + "epoch": 0.3406481091805447, + "flos": 42813131783040.0, + "grad_norm": 2.1179742822946186, + "language_loss": 0.64817721, + "learning_rate": 3.070416812648372e-06, + "loss": 0.67035925, + "num_input_tokens_seen": 60929405, + "step": 2833, + "time_per_iteration": 2.8924460411071777 + }, + { + "auxiliary_loss_clip": 0.01126439, + "auxiliary_loss_mlp": 0.0108613, + "balance_loss_clip": 1.03568804, + "balance_loss_mlp": 1.00511503, + "epoch": 0.3407683520711838, + "flos": 26761457917440.0, + "grad_norm": 2.000626906876136, + "language_loss": 0.64962971, + "learning_rate": 3.069758718505951e-06, + "loss": 0.67175543, + "num_input_tokens_seen": 60951145, + "step": 2834, + "time_per_iteration": 2.82517147064209 + }, + { + "auxiliary_loss_clip": 0.01151866, + "auxiliary_loss_mlp": 0.01087555, + "balance_loss_clip": 1.03762543, + "balance_loss_mlp": 1.00644493, + "epoch": 0.3408885949618229, + "flos": 28767643309440.0, + "grad_norm": 2.1603434801419708, + "language_loss": 0.79677689, + "learning_rate": 3.0691004620819836e-06, + "loss": 0.81917107, + "num_input_tokens_seen": 60971275, + "step": 2835, + "time_per_iteration": 3.6042494773864746 + }, + { + "auxiliary_loss_clip": 0.01119892, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_clip": 1.0530827, + "balance_loss_mlp": 1.00478923, + "epoch": 0.341008837852462, + "flos": 63576252881280.0, + "grad_norm": 1.0630547326145745, + "language_loss": 0.60127771, + "learning_rate": 3.0684420434763254e-06, + "loss": 0.62331796, + "num_input_tokens_seen": 61037460, + "step": 2836, + "time_per_iteration": 3.39296293258667 + }, + { + "auxiliary_loss_clip": 0.01114913, + "auxiliary_loss_mlp": 0.01088283, + "balance_loss_clip": 1.03255951, + "balance_loss_mlp": 1.00722098, + "epoch": 0.34112908074310105, + "flos": 20812173120000.0, + "grad_norm": 1.956148661309152, + "language_loss": 0.76615191, + "learning_rate": 3.06778346278886e-06, + "loss": 0.78818393, + "num_input_tokens_seen": 61056295, + "step": 2837, + "time_per_iteration": 2.8025333881378174 + }, + { + "auxiliary_loss_clip": 0.01151091, + "auxiliary_loss_mlp": 0.01086804, + "balance_loss_clip": 1.03701234, + "balance_loss_mlp": 1.00545549, + "epoch": 0.34124932363374016, + "flos": 24976446520320.0, + "grad_norm": 1.7906478219679025, + "language_loss": 0.79185247, + "learning_rate": 3.0671247201194906e-06, + "loss": 0.81423146, + "num_input_tokens_seen": 61078430, + "step": 2838, + "time_per_iteration": 2.6751344203948975 + }, + { + "auxiliary_loss_clip": 0.011214, + "auxiliary_loss_mlp": 0.01087642, + "balance_loss_clip": 1.0330565, + "balance_loss_mlp": 1.00629306, + "epoch": 0.3413695665243792, + "flos": 28402970480640.0, + "grad_norm": 2.0611752568156816, + "language_loss": 0.75462627, + "learning_rate": 3.066465815568151e-06, + "loss": 0.77671665, + "num_input_tokens_seen": 61099260, + "step": 2839, + "time_per_iteration": 3.806987762451172 + }, + { + "auxiliary_loss_clip": 0.01141058, + "auxiliary_loss_mlp": 0.01087547, + "balance_loss_clip": 1.03427911, + "balance_loss_mlp": 1.00638914, + "epoch": 0.34148980941501833, + "flos": 25302012416640.0, + "grad_norm": 1.7413302510705084, + "language_loss": 0.68823075, + "learning_rate": 3.0658067492347947e-06, + "loss": 0.71051681, + "num_input_tokens_seen": 61121900, + "step": 2840, + "time_per_iteration": 2.75069260597229 + }, + { + "auxiliary_loss_clip": 0.01090403, + "auxiliary_loss_mlp": 0.01086951, + "balance_loss_clip": 1.02925014, + "balance_loss_mlp": 1.00584066, + "epoch": 0.34161005230565744, + "flos": 17530081747200.0, + "grad_norm": 8.615125034676158, + "language_loss": 0.66304195, + "learning_rate": 3.065147521219402e-06, + "loss": 0.68481553, + "num_input_tokens_seen": 61141155, + "step": 2841, + "time_per_iteration": 3.857556104660034 + }, + { + "auxiliary_loss_clip": 0.01130258, + "auxiliary_loss_mlp": 0.01086359, + "balance_loss_clip": 1.03295588, + "balance_loss_mlp": 1.00529718, + "epoch": 0.3417302951962965, + "flos": 43650101566080.0, + "grad_norm": 4.037202892853723, + "language_loss": 0.74374783, + "learning_rate": 3.064488131621977e-06, + "loss": 0.76591408, + "num_input_tokens_seen": 61164480, + "step": 2842, + "time_per_iteration": 2.8888092041015625 + }, + { + "auxiliary_loss_clip": 0.01141693, + "auxiliary_loss_mlp": 0.0108884, + "balance_loss_clip": 1.0352819, + "balance_loss_mlp": 1.00772977, + "epoch": 0.3418505380869356, + "flos": 30882207012480.0, + "grad_norm": 1.7945995500940275, + "language_loss": 0.74080658, + "learning_rate": 3.063828580542549e-06, + "loss": 0.76311189, + "num_input_tokens_seen": 61185675, + "step": 2843, + "time_per_iteration": 3.613161563873291 + }, + { + "auxiliary_loss_clip": 0.01131674, + "auxiliary_loss_mlp": 0.0108613, + "balance_loss_clip": 1.03425014, + "balance_loss_mlp": 1.00525796, + "epoch": 0.3419707809775747, + "flos": 19463871277440.0, + "grad_norm": 1.8033086501564146, + "language_loss": 0.73411846, + "learning_rate": 3.0631688680811706e-06, + "loss": 0.7562964, + "num_input_tokens_seen": 61205300, + "step": 2844, + "time_per_iteration": 2.7090184688568115 + }, + { + "auxiliary_loss_clip": 0.01149393, + "auxiliary_loss_mlp": 0.01086907, + "balance_loss_clip": 1.03449488, + "balance_loss_mlp": 1.00579691, + "epoch": 0.3420910238682138, + "flos": 28727818104960.0, + "grad_norm": 2.233375190157134, + "language_loss": 0.75517106, + "learning_rate": 3.062508994337921e-06, + "loss": 0.77753413, + "num_input_tokens_seen": 61224905, + "step": 2845, + "time_per_iteration": 2.759194850921631 + }, + { + "auxiliary_loss_clip": 0.01138499, + "auxiliary_loss_mlp": 0.01088045, + "balance_loss_clip": 1.03281558, + "balance_loss_mlp": 1.00683951, + "epoch": 0.3422112667588529, + "flos": 21397265758080.0, + "grad_norm": 2.186483588234401, + "language_loss": 0.79368269, + "learning_rate": 3.0618489594129013e-06, + "loss": 0.81594813, + "num_input_tokens_seen": 61243045, + "step": 2846, + "time_per_iteration": 2.734494924545288 + }, + { + "auxiliary_loss_clip": 0.01121799, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_clip": 1.03387737, + "balance_loss_mlp": 1.00630832, + "epoch": 0.342331509649492, + "flos": 13881450038400.0, + "grad_norm": 1.8515023862070081, + "language_loss": 0.71135038, + "learning_rate": 3.061188763406239e-06, + "loss": 0.73344398, + "num_input_tokens_seen": 61259190, + "step": 2847, + "time_per_iteration": 2.769680976867676 + }, + { + "auxiliary_loss_clip": 0.01133875, + "auxiliary_loss_mlp": 0.01087323, + "balance_loss_clip": 1.03599453, + "balance_loss_mlp": 1.00611782, + "epoch": 0.34245175254013105, + "flos": 28621450955520.0, + "grad_norm": 2.6439581733544504, + "language_loss": 0.82176888, + "learning_rate": 3.060528406418085e-06, + "loss": 0.84398091, + "num_input_tokens_seen": 61279040, + "step": 2848, + "time_per_iteration": 2.9136040210723877 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01087713, + "balance_loss_clip": 1.03598404, + "balance_loss_mlp": 1.00665104, + "epoch": 0.34257199543077016, + "flos": 34127058960000.0, + "grad_norm": 2.533561586546064, + "language_loss": 0.61961067, + "learning_rate": 3.0598678885486145e-06, + "loss": 0.64183104, + "num_input_tokens_seen": 61301580, + "step": 2849, + "time_per_iteration": 2.9102554321289062 + }, + { + "auxiliary_loss_clip": 0.01112185, + "auxiliary_loss_mlp": 0.00873498, + "balance_loss_clip": 1.03583479, + "balance_loss_mlp": 1.00021553, + "epoch": 0.34269223832140927, + "flos": 19974018188160.0, + "grad_norm": 1.7726483963758644, + "language_loss": 0.74754149, + "learning_rate": 3.0592072098980282e-06, + "loss": 0.76739836, + "num_input_tokens_seen": 61321240, + "step": 2850, + "time_per_iteration": 2.8465728759765625 + }, + { + "auxiliary_loss_clip": 0.01125686, + "auxiliary_loss_mlp": 0.01086289, + "balance_loss_clip": 1.02968252, + "balance_loss_mlp": 1.00532222, + "epoch": 0.3428124812120483, + "flos": 27235658292480.0, + "grad_norm": 2.531659656182376, + "language_loss": 0.73178601, + "learning_rate": 3.0585463705665514e-06, + "loss": 0.75390577, + "num_input_tokens_seen": 61341615, + "step": 2851, + "time_per_iteration": 2.8661553859710693 + }, + { + "auxiliary_loss_clip": 0.01124529, + "auxiliary_loss_mlp": 0.01085424, + "balance_loss_clip": 1.03459263, + "balance_loss_mlp": 1.00436127, + "epoch": 0.34293272410268744, + "flos": 24570871079040.0, + "grad_norm": 2.2278992627040086, + "language_loss": 0.70824468, + "learning_rate": 3.0578853706544304e-06, + "loss": 0.73034418, + "num_input_tokens_seen": 61359005, + "step": 2852, + "time_per_iteration": 2.9616870880126953 + }, + { + "auxiliary_loss_clip": 0.01119551, + "auxiliary_loss_mlp": 0.00873483, + "balance_loss_clip": 1.03198099, + "balance_loss_mlp": 1.00027966, + "epoch": 0.34305296699332655, + "flos": 21506865131520.0, + "grad_norm": 1.8841547781437513, + "language_loss": 0.65188301, + "learning_rate": 3.0572242102619404e-06, + "loss": 0.67181331, + "num_input_tokens_seen": 61376160, + "step": 2853, + "time_per_iteration": 2.789470672607422 + }, + { + "auxiliary_loss_clip": 0.0112805, + "auxiliary_loss_mlp": 0.01087075, + "balance_loss_clip": 1.03229809, + "balance_loss_mlp": 1.00601292, + "epoch": 0.3431732098839656, + "flos": 24056665931520.0, + "grad_norm": 2.1167168442538142, + "language_loss": 0.80575812, + "learning_rate": 3.0565628894893784e-06, + "loss": 0.82790935, + "num_input_tokens_seen": 61396795, + "step": 2854, + "time_per_iteration": 2.8135666847229004 + }, + { + "auxiliary_loss_clip": 0.01143884, + "auxiliary_loss_mlp": 0.0108727, + "balance_loss_clip": 1.03703415, + "balance_loss_mlp": 1.00620794, + "epoch": 0.3432934527746047, + "flos": 16800879744000.0, + "grad_norm": 1.784413769525497, + "language_loss": 0.74879742, + "learning_rate": 3.0559014084370655e-06, + "loss": 0.77110898, + "num_input_tokens_seen": 61415320, + "step": 2855, + "time_per_iteration": 2.753951072692871 + }, + { + "auxiliary_loss_clip": 0.0113322, + "auxiliary_loss_mlp": 0.01086764, + "balance_loss_clip": 1.03507125, + "balance_loss_mlp": 1.00551069, + "epoch": 0.34341369566524377, + "flos": 23439720908160.0, + "grad_norm": 1.7844369104988782, + "language_loss": 0.78684503, + "learning_rate": 3.055239767205349e-06, + "loss": 0.80904484, + "num_input_tokens_seen": 61437070, + "step": 2856, + "time_per_iteration": 2.8525991439819336 + }, + { + "auxiliary_loss_clip": 0.01142459, + "auxiliary_loss_mlp": 0.01088396, + "balance_loss_clip": 1.03749228, + "balance_loss_mlp": 1.0073812, + "epoch": 0.3435339385558829, + "flos": 17267466435840.0, + "grad_norm": 2.0552844210352887, + "language_loss": 0.78176188, + "learning_rate": 3.054577965894599e-06, + "loss": 0.80407035, + "num_input_tokens_seen": 61453215, + "step": 2857, + "time_per_iteration": 2.7274858951568604 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.01087281, + "balance_loss_clip": 1.03631485, + "balance_loss_mlp": 1.00612307, + "epoch": 0.343654181446522, + "flos": 22199366413440.0, + "grad_norm": 1.4433770383554294, + "language_loss": 0.701253, + "learning_rate": 3.0539160046052094e-06, + "loss": 0.72332633, + "num_input_tokens_seen": 61472915, + "step": 2858, + "time_per_iteration": 2.7119486331939697 + }, + { + "auxiliary_loss_clip": 0.0112978, + "auxiliary_loss_mlp": 0.01086771, + "balance_loss_clip": 1.03271914, + "balance_loss_mlp": 1.00542283, + "epoch": 0.34377442433716104, + "flos": 19901801894400.0, + "grad_norm": 2.533562627331143, + "language_loss": 0.70525324, + "learning_rate": 3.0532538834376003e-06, + "loss": 0.72741878, + "num_input_tokens_seen": 61492475, + "step": 2859, + "time_per_iteration": 2.7564425468444824 + }, + { + "auxiliary_loss_clip": 0.01143347, + "auxiliary_loss_mlp": 0.01088549, + "balance_loss_clip": 1.03620124, + "balance_loss_mlp": 1.00729632, + "epoch": 0.34389466722780015, + "flos": 22197678474240.0, + "grad_norm": 1.7567615669508476, + "language_loss": 0.78163713, + "learning_rate": 3.0525916024922143e-06, + "loss": 0.80395609, + "num_input_tokens_seen": 61511660, + "step": 2860, + "time_per_iteration": 3.5503320693969727 + }, + { + "auxiliary_loss_clip": 0.01135614, + "auxiliary_loss_mlp": 0.01087356, + "balance_loss_clip": 1.03676105, + "balance_loss_mlp": 1.0063411, + "epoch": 0.34401491011843927, + "flos": 18624567110400.0, + "grad_norm": 2.8530399526030186, + "language_loss": 0.84306264, + "learning_rate": 3.0519291618695193e-06, + "loss": 0.86529243, + "num_input_tokens_seen": 61529060, + "step": 2861, + "time_per_iteration": 2.802058696746826 + }, + { + "auxiliary_loss_clip": 0.01121687, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.03271711, + "balance_loss_mlp": 1.00566173, + "epoch": 0.3441351530090783, + "flos": 17858197509120.0, + "grad_norm": 1.537006447859716, + "language_loss": 0.75542045, + "learning_rate": 3.0512665616700065e-06, + "loss": 0.77750456, + "num_input_tokens_seen": 61548125, + "step": 2862, + "time_per_iteration": 2.887873888015747 + }, + { + "auxiliary_loss_clip": 0.01112442, + "auxiliary_loss_mlp": 0.01085613, + "balance_loss_clip": 1.03198588, + "balance_loss_mlp": 1.00455081, + "epoch": 0.34425539589971743, + "flos": 23112754381440.0, + "grad_norm": 1.9835709190248234, + "language_loss": 0.89089727, + "learning_rate": 3.0506038019941933e-06, + "loss": 0.9128778, + "num_input_tokens_seen": 61568135, + "step": 2863, + "time_per_iteration": 2.9249846935272217 + }, + { + "auxiliary_loss_clip": 0.01119641, + "auxiliary_loss_mlp": 0.01086766, + "balance_loss_clip": 1.03240108, + "balance_loss_mlp": 1.00551343, + "epoch": 0.34437563879035654, + "flos": 21907699977600.0, + "grad_norm": 2.8132486699170998, + "language_loss": 0.67830801, + "learning_rate": 3.049940882942617e-06, + "loss": 0.7003721, + "num_input_tokens_seen": 61586920, + "step": 2864, + "time_per_iteration": 3.8514044284820557 + }, + { + "auxiliary_loss_clip": 0.0115093, + "auxiliary_loss_mlp": 0.01085933, + "balance_loss_clip": 1.03599179, + "balance_loss_mlp": 1.00487125, + "epoch": 0.3444958816809956, + "flos": 23076915586560.0, + "grad_norm": 2.631277165994145, + "language_loss": 0.80063587, + "learning_rate": 3.0492778046158448e-06, + "loss": 0.82300448, + "num_input_tokens_seen": 61608340, + "step": 2865, + "time_per_iteration": 2.753016471862793 + }, + { + "auxiliary_loss_clip": 0.01140648, + "auxiliary_loss_mlp": 0.01087371, + "balance_loss_clip": 1.0360719, + "balance_loss_mlp": 1.00630844, + "epoch": 0.3446161245716347, + "flos": 21908633731200.0, + "grad_norm": 2.163345120270574, + "language_loss": 0.76722753, + "learning_rate": 3.0486145671144633e-06, + "loss": 0.78950769, + "num_input_tokens_seen": 61628130, + "step": 2866, + "time_per_iteration": 4.247422933578491 + }, + { + "auxiliary_loss_clip": 0.01092592, + "auxiliary_loss_mlp": 0.01087312, + "balance_loss_clip": 1.02748811, + "balance_loss_mlp": 1.00610709, + "epoch": 0.3447363674622738, + "flos": 25112834461440.0, + "grad_norm": 2.997422363408704, + "language_loss": 0.76700866, + "learning_rate": 3.047951170539086e-06, + "loss": 0.78880769, + "num_input_tokens_seen": 61647755, + "step": 2867, + "time_per_iteration": 2.9683871269226074 + }, + { + "auxiliary_loss_clip": 0.01115456, + "auxiliary_loss_mlp": 0.01086372, + "balance_loss_clip": 1.0289309, + "balance_loss_mlp": 1.00550067, + "epoch": 0.3448566103529129, + "flos": 11984684451840.0, + "grad_norm": 1.8550856929448272, + "language_loss": 0.84532523, + "learning_rate": 3.047287614990349e-06, + "loss": 0.86734354, + "num_input_tokens_seen": 61665675, + "step": 2868, + "time_per_iteration": 3.772104501724243 + }, + { + "auxiliary_loss_clip": 0.01130913, + "auxiliary_loss_mlp": 0.01086956, + "balance_loss_clip": 1.0335598, + "balance_loss_mlp": 1.00575054, + "epoch": 0.344976853243552, + "flos": 40187882465280.0, + "grad_norm": 2.868787702360646, + "language_loss": 0.61666191, + "learning_rate": 3.046623900568914e-06, + "loss": 0.63884056, + "num_input_tokens_seen": 61688240, + "step": 2869, + "time_per_iteration": 2.93166446685791 + }, + { + "auxiliary_loss_clip": 0.01126615, + "auxiliary_loss_mlp": 0.01086073, + "balance_loss_clip": 1.030527, + "balance_loss_mlp": 1.00482011, + "epoch": 0.34509709613419104, + "flos": 28723652127360.0, + "grad_norm": 2.65678408316972, + "language_loss": 0.70167851, + "learning_rate": 3.045960027375465e-06, + "loss": 0.72380543, + "num_input_tokens_seen": 61706075, + "step": 2870, + "time_per_iteration": 2.842120885848999 + }, + { + "auxiliary_loss_clip": 0.01140009, + "auxiliary_loss_mlp": 0.01086346, + "balance_loss_clip": 1.03306341, + "balance_loss_mlp": 1.00528312, + "epoch": 0.34521733902483015, + "flos": 29967597982080.0, + "grad_norm": 3.5479498595392704, + "language_loss": 0.8227756, + "learning_rate": 3.045295995510711e-06, + "loss": 0.84503913, + "num_input_tokens_seen": 61723045, + "step": 2871, + "time_per_iteration": 2.931095838546753 + }, + { + "auxiliary_loss_clip": 0.01126342, + "auxiliary_loss_mlp": 0.01087927, + "balance_loss_clip": 1.03082776, + "balance_loss_mlp": 1.00700796, + "epoch": 0.34533758191546926, + "flos": 27923059843200.0, + "grad_norm": 1.7226797713444575, + "language_loss": 0.73629844, + "learning_rate": 3.0446318050753865e-06, + "loss": 0.75844109, + "num_input_tokens_seen": 61743525, + "step": 2872, + "time_per_iteration": 2.9955315589904785 + }, + { + "auxiliary_loss_clip": 0.01141023, + "auxiliary_loss_mlp": 0.01088452, + "balance_loss_clip": 1.03389168, + "balance_loss_mlp": 1.00753307, + "epoch": 0.3454578248061083, + "flos": 27125879351040.0, + "grad_norm": 2.2188508458645497, + "language_loss": 0.77474403, + "learning_rate": 3.0439674561702474e-06, + "loss": 0.79703879, + "num_input_tokens_seen": 61763025, + "step": 2873, + "time_per_iteration": 2.789182424545288 + }, + { + "auxiliary_loss_clip": 0.01139397, + "auxiliary_loss_mlp": 0.01086436, + "balance_loss_clip": 1.03456116, + "balance_loss_mlp": 1.00546908, + "epoch": 0.3455780676967474, + "flos": 19024899166080.0, + "grad_norm": 1.9955811486467123, + "language_loss": 0.87817138, + "learning_rate": 3.043302948896076e-06, + "loss": 0.90042973, + "num_input_tokens_seen": 61781630, + "step": 2874, + "time_per_iteration": 2.7429511547088623 + }, + { + "auxiliary_loss_clip": 0.01110811, + "auxiliary_loss_mlp": 0.01087682, + "balance_loss_clip": 1.03163052, + "balance_loss_mlp": 1.00652421, + "epoch": 0.34569831058738654, + "flos": 34496005507200.0, + "grad_norm": 2.066723252809286, + "language_loss": 0.60489428, + "learning_rate": 3.0426382833536756e-06, + "loss": 0.62687922, + "num_input_tokens_seen": 61804985, + "step": 2875, + "time_per_iteration": 3.0467777252197266 + }, + { + "auxiliary_loss_clip": 0.01123776, + "auxiliary_loss_mlp": 0.01086708, + "balance_loss_clip": 1.03398848, + "balance_loss_mlp": 1.00564563, + "epoch": 0.3458185534780256, + "flos": 31138681098240.0, + "grad_norm": 2.736057011391462, + "language_loss": 0.7760129, + "learning_rate": 3.041973459643877e-06, + "loss": 0.79811776, + "num_input_tokens_seen": 61824440, + "step": 2876, + "time_per_iteration": 2.8646466732025146 + }, + { + "auxiliary_loss_clip": 0.01114238, + "auxiliary_loss_mlp": 0.01085543, + "balance_loss_clip": 1.0321337, + "balance_loss_mlp": 1.00448036, + "epoch": 0.3459387963686647, + "flos": 32452508862720.0, + "grad_norm": 12.294642577306552, + "language_loss": 0.66891092, + "learning_rate": 3.0413084778675334e-06, + "loss": 0.69090867, + "num_input_tokens_seen": 61845690, + "step": 2877, + "time_per_iteration": 2.998183488845825 + }, + { + "auxiliary_loss_clip": 0.01133341, + "auxiliary_loss_mlp": 0.00873465, + "balance_loss_clip": 1.03482878, + "balance_loss_mlp": 1.00024199, + "epoch": 0.3460590392593038, + "flos": 24675658030080.0, + "grad_norm": 1.9146012082123116, + "language_loss": 0.84103, + "learning_rate": 3.0406433381255214e-06, + "loss": 0.86109805, + "num_input_tokens_seen": 61863725, + "step": 2878, + "time_per_iteration": 2.818814992904663 + }, + { + "auxiliary_loss_clip": 0.01138591, + "auxiliary_loss_mlp": 0.01086427, + "balance_loss_clip": 1.03460896, + "balance_loss_mlp": 1.00531662, + "epoch": 0.34617928214994287, + "flos": 18807316531200.0, + "grad_norm": 2.374657420268208, + "language_loss": 0.82039976, + "learning_rate": 3.0399780405187425e-06, + "loss": 0.84264994, + "num_input_tokens_seen": 61882720, + "step": 2879, + "time_per_iteration": 2.7811920642852783 + }, + { + "auxiliary_loss_clip": 0.01141144, + "auxiliary_loss_mlp": 0.01087472, + "balance_loss_clip": 1.03446817, + "balance_loss_mlp": 1.00650501, + "epoch": 0.346299525040582, + "flos": 24857653265280.0, + "grad_norm": 1.8086117376234019, + "language_loss": 0.78295922, + "learning_rate": 3.0393125851481216e-06, + "loss": 0.8052454, + "num_input_tokens_seen": 61902595, + "step": 2880, + "time_per_iteration": 2.8498401641845703 + }, + { + "auxiliary_loss_clip": 0.01121496, + "auxiliary_loss_mlp": 0.01086062, + "balance_loss_clip": 1.03309155, + "balance_loss_mlp": 1.00490475, + "epoch": 0.3464197679312211, + "flos": 16434914025600.0, + "grad_norm": 3.8107161476768168, + "language_loss": 0.86681259, + "learning_rate": 3.038646972114608e-06, + "loss": 0.88888812, + "num_input_tokens_seen": 61918920, + "step": 2881, + "time_per_iteration": 2.760563373565674 + }, + { + "auxiliary_loss_clip": 0.0111605, + "auxiliary_loss_mlp": 0.01086982, + "balance_loss_clip": 1.03032064, + "balance_loss_mlp": 1.00601506, + "epoch": 0.34654001082186014, + "flos": 22382474970240.0, + "grad_norm": 1.848644959453443, + "language_loss": 0.67584717, + "learning_rate": 3.037981201519174e-06, + "loss": 0.69787747, + "num_input_tokens_seen": 61939520, + "step": 2882, + "time_per_iteration": 2.8742470741271973 + }, + { + "auxiliary_loss_clip": 0.01140544, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_clip": 1.0354352, + "balance_loss_mlp": 1.00563622, + "epoch": 0.34666025371249926, + "flos": 19573901614080.0, + "grad_norm": 8.584617123957402, + "language_loss": 0.71361661, + "learning_rate": 3.0373152734628175e-06, + "loss": 0.73588765, + "num_input_tokens_seen": 61957800, + "step": 2883, + "time_per_iteration": 2.7737045288085938 + }, + { + "auxiliary_loss_clip": 0.01141299, + "auxiliary_loss_mlp": 0.01087835, + "balance_loss_clip": 1.03410554, + "balance_loss_mlp": 1.0068202, + "epoch": 0.34678049660313837, + "flos": 15267637751040.0, + "grad_norm": 2.365259264550678, + "language_loss": 0.76123917, + "learning_rate": 3.0366491880465584e-06, + "loss": 0.78353047, + "num_input_tokens_seen": 61975820, + "step": 2884, + "time_per_iteration": 2.7917776107788086 + }, + { + "auxiliary_loss_clip": 0.01153209, + "auxiliary_loss_mlp": 0.01086157, + "balance_loss_clip": 1.03783679, + "balance_loss_mlp": 1.00509477, + "epoch": 0.3469007394937774, + "flos": 21181550630400.0, + "grad_norm": 1.8473265320237064, + "language_loss": 0.8220675, + "learning_rate": 3.035982945371443e-06, + "loss": 0.8444612, + "num_input_tokens_seen": 61997515, + "step": 2885, + "time_per_iteration": 2.755490779876709 + }, + { + "auxiliary_loss_clip": 0.01133727, + "auxiliary_loss_mlp": 0.01088396, + "balance_loss_clip": 1.03495371, + "balance_loss_mlp": 1.00742865, + "epoch": 0.34702098238441653, + "flos": 22375471818240.0, + "grad_norm": 2.1277675756531744, + "language_loss": 0.85128033, + "learning_rate": 3.035316545538537e-06, + "loss": 0.87350154, + "num_input_tokens_seen": 62016310, + "step": 2886, + "time_per_iteration": 3.7250280380249023 + }, + { + "auxiliary_loss_clip": 0.01124074, + "auxiliary_loss_mlp": 0.01086939, + "balance_loss_clip": 1.03005528, + "balance_loss_mlp": 1.00601983, + "epoch": 0.3471412252750556, + "flos": 22929430343040.0, + "grad_norm": 1.8342273349250664, + "language_loss": 0.79017866, + "learning_rate": 3.034649988648935e-06, + "loss": 0.81228876, + "num_input_tokens_seen": 62036075, + "step": 2887, + "time_per_iteration": 2.839165449142456 + }, + { + "auxiliary_loss_clip": 0.01130256, + "auxiliary_loss_mlp": 0.01087015, + "balance_loss_clip": 1.0329659, + "balance_loss_mlp": 1.00585699, + "epoch": 0.3472614681656947, + "flos": 21324259365120.0, + "grad_norm": 1.7076684733399268, + "language_loss": 0.80479276, + "learning_rate": 3.033983274803752e-06, + "loss": 0.82696545, + "num_input_tokens_seen": 62055865, + "step": 2888, + "time_per_iteration": 2.7807164192199707 + }, + { + "auxiliary_loss_clip": 0.01136353, + "auxiliary_loss_mlp": 0.01087585, + "balance_loss_clip": 1.03764379, + "balance_loss_mlp": 1.00647509, + "epoch": 0.3473817110563338, + "flos": 23475739271040.0, + "grad_norm": 2.4647098194714196, + "language_loss": 0.72366297, + "learning_rate": 3.0333164041041283e-06, + "loss": 0.74590236, + "num_input_tokens_seen": 62072180, + "step": 2889, + "time_per_iteration": 2.9046707153320312 + }, + { + "auxiliary_loss_clip": 0.01106207, + "auxiliary_loss_mlp": 0.01085172, + "balance_loss_clip": 1.0339731, + "balance_loss_mlp": 1.00430059, + "epoch": 0.34750195394697286, + "flos": 22346025644160.0, + "grad_norm": 1.9504733330297126, + "language_loss": 0.71817428, + "learning_rate": 3.032649376651228e-06, + "loss": 0.74008805, + "num_input_tokens_seen": 62091600, + "step": 2890, + "time_per_iteration": 3.9440555572509766 + }, + { + "auxiliary_loss_clip": 0.01121149, + "auxiliary_loss_mlp": 0.01086419, + "balance_loss_clip": 1.03285933, + "balance_loss_mlp": 1.00516558, + "epoch": 0.347622196837612, + "flos": 29095004885760.0, + "grad_norm": 2.3460501718725144, + "language_loss": 0.75896502, + "learning_rate": 3.031982192546238e-06, + "loss": 0.78104067, + "num_input_tokens_seen": 62114695, + "step": 2891, + "time_per_iteration": 3.9083080291748047 + }, + { + "auxiliary_loss_clip": 0.01144606, + "auxiliary_loss_mlp": 0.01087816, + "balance_loss_clip": 1.03755999, + "balance_loss_mlp": 1.00680089, + "epoch": 0.3477424397282511, + "flos": 22455732758400.0, + "grad_norm": 3.1985720657561134, + "language_loss": 0.94601953, + "learning_rate": 3.0313148518903696e-06, + "loss": 0.96834373, + "num_input_tokens_seen": 62134520, + "step": 2892, + "time_per_iteration": 2.84061598777771 + }, + { + "auxiliary_loss_clip": 0.01127296, + "auxiliary_loss_mlp": 0.0108512, + "balance_loss_clip": 1.0309037, + "balance_loss_mlp": 1.00415301, + "epoch": 0.34786268261889014, + "flos": 15778790242560.0, + "grad_norm": 2.5178918784872675, + "language_loss": 0.81329912, + "learning_rate": 3.030647354784859e-06, + "loss": 0.83542329, + "num_input_tokens_seen": 62151560, + "step": 2893, + "time_per_iteration": 2.8136701583862305 + }, + { + "auxiliary_loss_clip": 0.01118003, + "auxiliary_loss_mlp": 0.01085786, + "balance_loss_clip": 1.03040004, + "balance_loss_mlp": 1.00491464, + "epoch": 0.34798292550952925, + "flos": 20777627214720.0, + "grad_norm": 1.7901835363341858, + "language_loss": 0.77348518, + "learning_rate": 3.029979701330964e-06, + "loss": 0.79552305, + "num_input_tokens_seen": 62170985, + "step": 2894, + "time_per_iteration": 3.628984212875366 + }, + { + "auxiliary_loss_clip": 0.0113542, + "auxiliary_loss_mlp": 0.0108651, + "balance_loss_clip": 1.03734088, + "balance_loss_mlp": 1.00559103, + "epoch": 0.34810316840016836, + "flos": 19937820257280.0, + "grad_norm": 12.540360316649577, + "language_loss": 0.79868793, + "learning_rate": 3.029311891629966e-06, + "loss": 0.82090724, + "num_input_tokens_seen": 62189440, + "step": 2895, + "time_per_iteration": 2.834850311279297 + }, + { + "auxiliary_loss_clip": 0.01129371, + "auxiliary_loss_mlp": 0.01088933, + "balance_loss_clip": 1.03357816, + "balance_loss_mlp": 1.007918, + "epoch": 0.3482234112908074, + "flos": 23623296341760.0, + "grad_norm": 1.7170782485992964, + "language_loss": 0.74427497, + "learning_rate": 3.0286439257831744e-06, + "loss": 0.76645803, + "num_input_tokens_seen": 62208910, + "step": 2896, + "time_per_iteration": 2.792819023132324 + }, + { + "auxiliary_loss_clip": 0.01150128, + "auxiliary_loss_mlp": 0.01087854, + "balance_loss_clip": 1.03505385, + "balance_loss_mlp": 1.00650525, + "epoch": 0.3483436541814465, + "flos": 23986712194560.0, + "grad_norm": 1.9684466549438173, + "language_loss": 0.71955681, + "learning_rate": 3.0279758038919156e-06, + "loss": 0.74193656, + "num_input_tokens_seen": 62227135, + "step": 2897, + "time_per_iteration": 2.731247901916504 + }, + { + "auxiliary_loss_clip": 0.01141648, + "auxiliary_loss_mlp": 0.01087269, + "balance_loss_clip": 1.0363977, + "balance_loss_mlp": 1.00611126, + "epoch": 0.34846389707208564, + "flos": 22638338524800.0, + "grad_norm": 2.673254961340905, + "language_loss": 0.78583074, + "learning_rate": 3.0273075260575455e-06, + "loss": 0.80812001, + "num_input_tokens_seen": 62246035, + "step": 2898, + "time_per_iteration": 2.7606561183929443 + }, + { + "auxiliary_loss_clip": 0.01134345, + "auxiliary_loss_mlp": 0.01087847, + "balance_loss_clip": 1.03484607, + "balance_loss_mlp": 1.00673652, + "epoch": 0.3485841399627247, + "flos": 21792857218560.0, + "grad_norm": 1.9535634410740972, + "language_loss": 0.80847371, + "learning_rate": 3.0266390923814396e-06, + "loss": 0.83069557, + "num_input_tokens_seen": 62264095, + "step": 2899, + "time_per_iteration": 2.766552686691284 + }, + { + "auxiliary_loss_clip": 0.0112824, + "auxiliary_loss_mlp": 0.01086631, + "balance_loss_clip": 1.03285432, + "balance_loss_mlp": 1.00556874, + "epoch": 0.3487043828533638, + "flos": 17019036996480.0, + "grad_norm": 1.9964778564468049, + "language_loss": 0.81899852, + "learning_rate": 3.0259705029650008e-06, + "loss": 0.84114718, + "num_input_tokens_seen": 62282025, + "step": 2900, + "time_per_iteration": 2.7445993423461914 + }, + { + "auxiliary_loss_clip": 0.01144589, + "auxiliary_loss_mlp": 0.01084943, + "balance_loss_clip": 1.03774154, + "balance_loss_mlp": 1.00392866, + "epoch": 0.34882462574400286, + "flos": 22601135013120.0, + "grad_norm": 1.886405938142848, + "language_loss": 0.7323817, + "learning_rate": 3.025301757909652e-06, + "loss": 0.75467706, + "num_input_tokens_seen": 62302220, + "step": 2901, + "time_per_iteration": 2.6646265983581543 + }, + { + "auxiliary_loss_clip": 0.01120676, + "auxiliary_loss_mlp": 0.008736, + "balance_loss_clip": 1.03246665, + "balance_loss_mlp": 1.00029373, + "epoch": 0.34894486863464197, + "flos": 29861518141440.0, + "grad_norm": 1.9744677900286063, + "language_loss": 0.80547202, + "learning_rate": 3.024632857316842e-06, + "loss": 0.82541478, + "num_input_tokens_seen": 62323535, + "step": 2902, + "time_per_iteration": 2.868507146835327 + }, + { + "auxiliary_loss_clip": 0.01143583, + "auxiliary_loss_mlp": 0.01087562, + "balance_loss_clip": 1.03777671, + "balance_loss_mlp": 1.00640404, + "epoch": 0.3490651115252811, + "flos": 22122265870080.0, + "grad_norm": 4.125936810827608, + "language_loss": 0.77691567, + "learning_rate": 3.0239638012880412e-06, + "loss": 0.79922718, + "num_input_tokens_seen": 62343430, + "step": 2903, + "time_per_iteration": 2.70300555229187 + }, + { + "auxiliary_loss_clip": 0.01105327, + "auxiliary_loss_mlp": 0.0108638, + "balance_loss_clip": 1.03087544, + "balance_loss_mlp": 1.00522256, + "epoch": 0.34918535441592014, + "flos": 12676682943360.0, + "grad_norm": 4.3241330032735315, + "language_loss": 0.81018281, + "learning_rate": 3.0232945899247466e-06, + "loss": 0.83209991, + "num_input_tokens_seen": 62360365, + "step": 2904, + "time_per_iteration": 2.774251937866211 + }, + { + "auxiliary_loss_clip": 0.01144757, + "auxiliary_loss_mlp": 0.01086845, + "balance_loss_clip": 1.03754592, + "balance_loss_mlp": 1.0057354, + "epoch": 0.34930559730655925, + "flos": 23185617120000.0, + "grad_norm": 1.7273433813868337, + "language_loss": 0.7723518, + "learning_rate": 3.022625223328476e-06, + "loss": 0.79466784, + "num_input_tokens_seen": 62382105, + "step": 2905, + "time_per_iteration": 2.7362208366394043 + }, + { + "auxiliary_loss_clip": 0.01132442, + "auxiliary_loss_mlp": 0.01088081, + "balance_loss_clip": 1.03974628, + "balance_loss_mlp": 1.00687575, + "epoch": 0.34942584019719836, + "flos": 22855023319680.0, + "grad_norm": 1.5083776096539214, + "language_loss": 0.69123113, + "learning_rate": 3.0219557016007723e-06, + "loss": 0.71343637, + "num_input_tokens_seen": 62402235, + "step": 2906, + "time_per_iteration": 2.752763271331787 + }, + { + "auxiliary_loss_clip": 0.01136245, + "auxiliary_loss_mlp": 0.01086112, + "balance_loss_clip": 1.03213334, + "balance_loss_mlp": 1.00504923, + "epoch": 0.3495460830878374, + "flos": 24426043441920.0, + "grad_norm": 2.056528400158533, + "language_loss": 0.69856727, + "learning_rate": 3.021286024843202e-06, + "loss": 0.7207908, + "num_input_tokens_seen": 62420430, + "step": 2907, + "time_per_iteration": 2.7966630458831787 + }, + { + "auxiliary_loss_clip": 0.01163038, + "auxiliary_loss_mlp": 0.01080693, + "balance_loss_clip": 1.06339002, + "balance_loss_mlp": 1.00134742, + "epoch": 0.3496663259784765, + "flos": 70008749389440.0, + "grad_norm": 1.077791309569281, + "language_loss": 0.64766526, + "learning_rate": 3.0206161931573526e-06, + "loss": 0.6701026, + "num_input_tokens_seen": 62472980, + "step": 2908, + "time_per_iteration": 3.1616623401641846 + }, + { + "auxiliary_loss_clip": 0.01132382, + "auxiliary_loss_mlp": 0.01086248, + "balance_loss_clip": 1.03512323, + "balance_loss_mlp": 1.00528145, + "epoch": 0.34978656886911563, + "flos": 28692805322880.0, + "grad_norm": 1.9035330182716792, + "language_loss": 0.92846954, + "learning_rate": 3.0199462066448388e-06, + "loss": 0.95065594, + "num_input_tokens_seen": 62495175, + "step": 2909, + "time_per_iteration": 2.8180320262908936 + }, + { + "auxiliary_loss_clip": 0.01144613, + "auxiliary_loss_mlp": 0.01086507, + "balance_loss_clip": 1.03878188, + "balance_loss_mlp": 1.00525379, + "epoch": 0.3499068117597547, + "flos": 21142156389120.0, + "grad_norm": 1.8481018879060498, + "language_loss": 0.69164562, + "learning_rate": 3.019276065407296e-06, + "loss": 0.71395677, + "num_input_tokens_seen": 62514295, + "step": 2910, + "time_per_iteration": 2.7279226779937744 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01087653, + "balance_loss_clip": 1.03220963, + "balance_loss_mlp": 1.006495, + "epoch": 0.3500270546503938, + "flos": 22782699285120.0, + "grad_norm": 2.8926907050420496, + "language_loss": 0.80398643, + "learning_rate": 3.018605769546385e-06, + "loss": 0.82597375, + "num_input_tokens_seen": 62534850, + "step": 2911, + "time_per_iteration": 2.879192352294922 + }, + { + "auxiliary_loss_clip": 0.01145029, + "auxiliary_loss_mlp": 0.01087855, + "balance_loss_clip": 1.03789651, + "balance_loss_mlp": 1.00669742, + "epoch": 0.3501472975410329, + "flos": 22894058424960.0, + "grad_norm": 2.1282100884685486, + "language_loss": 0.79566461, + "learning_rate": 3.017935319163788e-06, + "loss": 0.81799352, + "num_input_tokens_seen": 62553810, + "step": 2912, + "time_per_iteration": 3.649451494216919 + }, + { + "auxiliary_loss_clip": 0.01144234, + "auxiliary_loss_mlp": 0.01085718, + "balance_loss_clip": 1.03806496, + "balance_loss_mlp": 1.00446486, + "epoch": 0.35026754043167196, + "flos": 25446588658560.0, + "grad_norm": 1.704645971689998, + "language_loss": 0.70555866, + "learning_rate": 3.017264714361213e-06, + "loss": 0.72785819, + "num_input_tokens_seen": 62573460, + "step": 2913, + "time_per_iteration": 2.728334426879883 + }, + { + "auxiliary_loss_clip": 0.01129367, + "auxiliary_loss_mlp": 0.00873669, + "balance_loss_clip": 1.03215504, + "balance_loss_mlp": 1.00040984, + "epoch": 0.3503877833223111, + "flos": 19573757959680.0, + "grad_norm": 2.2314472473984437, + "language_loss": 0.82299924, + "learning_rate": 3.016593955240389e-06, + "loss": 0.84302956, + "num_input_tokens_seen": 62592150, + "step": 2914, + "time_per_iteration": 2.804252862930298 + }, + { + "auxiliary_loss_clip": 0.01155473, + "auxiliary_loss_mlp": 0.01079791, + "balance_loss_clip": 1.06452918, + "balance_loss_mlp": 1.00044489, + "epoch": 0.3505080262129502, + "flos": 65072075880960.0, + "grad_norm": 0.81830456686631, + "language_loss": 0.63740653, + "learning_rate": 3.015923041903071e-06, + "loss": 0.65975916, + "num_input_tokens_seen": 62658275, + "step": 2915, + "time_per_iteration": 4.303407907485962 + }, + { + "auxiliary_loss_clip": 0.01144288, + "auxiliary_loss_mlp": 0.01086756, + "balance_loss_clip": 1.03911388, + "balance_loss_mlp": 1.0057894, + "epoch": 0.35062826910358924, + "flos": 29314562768640.0, + "grad_norm": 2.6823087850117417, + "language_loss": 0.83723235, + "learning_rate": 3.0152519744510347e-06, + "loss": 0.85954285, + "num_input_tokens_seen": 62678075, + "step": 2916, + "time_per_iteration": 2.768653392791748 + }, + { + "auxiliary_loss_clip": 0.0112069, + "auxiliary_loss_mlp": 0.01087305, + "balance_loss_clip": 1.03168046, + "balance_loss_mlp": 1.00624287, + "epoch": 0.35074851199422835, + "flos": 23987717775360.0, + "grad_norm": 1.8411775236627261, + "language_loss": 0.82298791, + "learning_rate": 3.014580752986081e-06, + "loss": 0.84506786, + "num_input_tokens_seen": 62696950, + "step": 2917, + "time_per_iteration": 4.192214727401733 + }, + { + "auxiliary_loss_clip": 0.01114805, + "auxiliary_loss_mlp": 0.01087842, + "balance_loss_clip": 1.03599858, + "balance_loss_mlp": 1.0068748, + "epoch": 0.3508687548848674, + "flos": 15224436668160.0, + "grad_norm": 1.9986265175015585, + "language_loss": 0.78375149, + "learning_rate": 3.0139093776100345e-06, + "loss": 0.80577797, + "num_input_tokens_seen": 62713540, + "step": 2918, + "time_per_iteration": 2.8007283210754395 + }, + { + "auxiliary_loss_clip": 0.01153299, + "auxiliary_loss_mlp": 0.01086312, + "balance_loss_clip": 1.03858769, + "balance_loss_mlp": 1.00539303, + "epoch": 0.3509889977755065, + "flos": 21361750185600.0, + "grad_norm": 1.9063530556401014, + "language_loss": 0.75530952, + "learning_rate": 3.013237848424741e-06, + "loss": 0.77770567, + "num_input_tokens_seen": 62732925, + "step": 2919, + "time_per_iteration": 3.636317014694214 + }, + { + "auxiliary_loss_clip": 0.01134683, + "auxiliary_loss_mlp": 0.010858, + "balance_loss_clip": 1.03755629, + "balance_loss_mlp": 1.00473809, + "epoch": 0.35110924066614563, + "flos": 19135360465920.0, + "grad_norm": 2.6322551147722884, + "language_loss": 0.75937539, + "learning_rate": 3.012566165532072e-06, + "loss": 0.78158021, + "num_input_tokens_seen": 62751715, + "step": 2920, + "time_per_iteration": 2.959629535675049 + }, + { + "auxiliary_loss_clip": 0.0108665, + "auxiliary_loss_mlp": 0.01086967, + "balance_loss_clip": 1.03051519, + "balance_loss_mlp": 1.00590444, + "epoch": 0.3512294835567847, + "flos": 21980885938560.0, + "grad_norm": 2.0501190413354826, + "language_loss": 0.77022052, + "learning_rate": 3.0118943290339207e-06, + "loss": 0.79195666, + "num_input_tokens_seen": 62771925, + "step": 2921, + "time_per_iteration": 2.875765323638916 + }, + { + "auxiliary_loss_clip": 0.01126639, + "auxiliary_loss_mlp": 0.01086433, + "balance_loss_clip": 1.03669882, + "balance_loss_mlp": 1.00537109, + "epoch": 0.3513497264474238, + "flos": 17817294896640.0, + "grad_norm": 1.8374367389848785, + "language_loss": 0.68052614, + "learning_rate": 3.011222339032204e-06, + "loss": 0.70265692, + "num_input_tokens_seen": 62790075, + "step": 2922, + "time_per_iteration": 2.9081783294677734 + }, + { + "auxiliary_loss_clip": 0.01153951, + "auxiliary_loss_mlp": 0.01086183, + "balance_loss_clip": 1.03941822, + "balance_loss_mlp": 1.00512064, + "epoch": 0.3514699693380629, + "flos": 26943417239040.0, + "grad_norm": 1.8088175566803568, + "language_loss": 0.69480479, + "learning_rate": 3.0105501956288626e-06, + "loss": 0.71720606, + "num_input_tokens_seen": 62810545, + "step": 2923, + "time_per_iteration": 2.7152369022369385 + }, + { + "auxiliary_loss_clip": 0.01145263, + "auxiliary_loss_mlp": 0.01086711, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.00541019, + "epoch": 0.35159021222870196, + "flos": 15267565923840.0, + "grad_norm": 2.1890178270202427, + "language_loss": 0.72228479, + "learning_rate": 3.0098778989258602e-06, + "loss": 0.74460453, + "num_input_tokens_seen": 62829155, + "step": 2924, + "time_per_iteration": 2.6974167823791504 + }, + { + "auxiliary_loss_clip": 0.01126444, + "auxiliary_loss_mlp": 0.01087521, + "balance_loss_clip": 1.03807116, + "balance_loss_mlp": 1.00631595, + "epoch": 0.35171045511934107, + "flos": 13984154000640.0, + "grad_norm": 2.1007528761290124, + "language_loss": 0.88451731, + "learning_rate": 3.009205449025183e-06, + "loss": 0.90665698, + "num_input_tokens_seen": 62845350, + "step": 2925, + "time_per_iteration": 2.7356104850769043 + }, + { + "auxiliary_loss_clip": 0.01113262, + "auxiliary_loss_mlp": 0.01087901, + "balance_loss_clip": 1.03767419, + "balance_loss_mlp": 1.006791, + "epoch": 0.3518306980099802, + "flos": 14283434119680.0, + "grad_norm": 2.1015192203339006, + "language_loss": 0.63312399, + "learning_rate": 3.008532846028842e-06, + "loss": 0.65513563, + "num_input_tokens_seen": 62862110, + "step": 2926, + "time_per_iteration": 2.87166690826416 + }, + { + "auxiliary_loss_clip": 0.01154351, + "auxiliary_loss_mlp": 0.01087824, + "balance_loss_clip": 1.03963709, + "balance_loss_mlp": 1.00661826, + "epoch": 0.35195094090061924, + "flos": 27052872958080.0, + "grad_norm": 2.5689502287694514, + "language_loss": 0.71905082, + "learning_rate": 3.0078600900388694e-06, + "loss": 0.7414726, + "num_input_tokens_seen": 62882415, + "step": 2927, + "time_per_iteration": 2.7437267303466797 + }, + { + "auxiliary_loss_clip": 0.01123371, + "auxiliary_loss_mlp": 0.01086427, + "balance_loss_clip": 1.03435731, + "balance_loss_mlp": 1.00541282, + "epoch": 0.35207118379125835, + "flos": 25629266252160.0, + "grad_norm": 2.3404538428638237, + "language_loss": 0.73846745, + "learning_rate": 3.007187181157323e-06, + "loss": 0.7605654, + "num_input_tokens_seen": 62902425, + "step": 2928, + "time_per_iteration": 2.8072361946105957 + }, + { + "auxiliary_loss_clip": 0.01100286, + "auxiliary_loss_mlp": 0.01085588, + "balance_loss_clip": 1.02941179, + "balance_loss_mlp": 1.00452602, + "epoch": 0.35219142668189746, + "flos": 18004713085440.0, + "grad_norm": 11.237327917258614, + "language_loss": 0.67764616, + "learning_rate": 3.006514119486282e-06, + "loss": 0.69950485, + "num_input_tokens_seen": 62919255, + "step": 2929, + "time_per_iteration": 2.869265079498291 + }, + { + "auxiliary_loss_clip": 0.01120345, + "auxiliary_loss_mlp": 0.01085641, + "balance_loss_clip": 1.03224134, + "balance_loss_mlp": 1.004722, + "epoch": 0.3523116695725365, + "flos": 14028109269120.0, + "grad_norm": 1.7851721014873712, + "language_loss": 0.69307721, + "learning_rate": 3.005840905127849e-06, + "loss": 0.715137, + "num_input_tokens_seen": 62936160, + "step": 2930, + "time_per_iteration": 2.772181749343872 + }, + { + "auxiliary_loss_clip": 0.01153039, + "auxiliary_loss_mlp": 0.0108728, + "balance_loss_clip": 1.03847516, + "balance_loss_mlp": 1.00631261, + "epoch": 0.3524319124631756, + "flos": 21433966479360.0, + "grad_norm": 2.1387067556702593, + "language_loss": 0.86855698, + "learning_rate": 3.0051675381841516e-06, + "loss": 0.89096016, + "num_input_tokens_seen": 62953470, + "step": 2931, + "time_per_iteration": 2.733459949493408 + }, + { + "auxiliary_loss_clip": 0.01079388, + "auxiliary_loss_mlp": 0.00873572, + "balance_loss_clip": 1.02970397, + "balance_loss_mlp": 1.00036073, + "epoch": 0.3525521553538147, + "flos": 26322773114880.0, + "grad_norm": 2.0561188198641123, + "language_loss": 0.76810187, + "learning_rate": 3.0044940187573363e-06, + "loss": 0.78763151, + "num_input_tokens_seen": 62974480, + "step": 2932, + "time_per_iteration": 2.9830029010772705 + }, + { + "auxiliary_loss_clip": 0.01145589, + "auxiliary_loss_mlp": 0.01087091, + "balance_loss_clip": 1.0390166, + "balance_loss_mlp": 1.00602841, + "epoch": 0.3526723982444538, + "flos": 21543314457600.0, + "grad_norm": 2.0127887896601395, + "language_loss": 0.64812577, + "learning_rate": 3.003820346949578e-06, + "loss": 0.67045259, + "num_input_tokens_seen": 62992560, + "step": 2933, + "time_per_iteration": 2.7575573921203613 + }, + { + "auxiliary_loss_clip": 0.01150591, + "auxiliary_loss_mlp": 0.01086775, + "balance_loss_clip": 1.0357095, + "balance_loss_mlp": 1.00576019, + "epoch": 0.3527926411350929, + "flos": 23733649900800.0, + "grad_norm": 2.0224854889957093, + "language_loss": 0.79222894, + "learning_rate": 3.003146522863071e-06, + "loss": 0.81460261, + "num_input_tokens_seen": 63013445, + "step": 2934, + "time_per_iteration": 2.758167266845703 + }, + { + "auxiliary_loss_clip": 0.01129763, + "auxiliary_loss_mlp": 0.01087252, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.00623775, + "epoch": 0.35291288402573195, + "flos": 30445461544320.0, + "grad_norm": 2.2597819520019207, + "language_loss": 0.85812962, + "learning_rate": 3.0024725466000345e-06, + "loss": 0.88029981, + "num_input_tokens_seen": 63033400, + "step": 2935, + "time_per_iteration": 2.8931076526641846 + }, + { + "auxiliary_loss_clip": 0.01144204, + "auxiliary_loss_mlp": 0.01086458, + "balance_loss_clip": 1.03915644, + "balance_loss_mlp": 1.00539565, + "epoch": 0.35303312691637107, + "flos": 23112179763840.0, + "grad_norm": 2.3888120935806287, + "language_loss": 0.78790009, + "learning_rate": 3.0017984182627087e-06, + "loss": 0.81020671, + "num_input_tokens_seen": 63052725, + "step": 2936, + "time_per_iteration": 2.7799532413482666 + }, + { + "auxiliary_loss_clip": 0.01125966, + "auxiliary_loss_mlp": 0.0087353, + "balance_loss_clip": 1.0359683, + "balance_loss_mlp": 1.00037551, + "epoch": 0.3531533698070102, + "flos": 21835699165440.0, + "grad_norm": 2.0118530247470643, + "language_loss": 0.82335359, + "learning_rate": 3.00112413795336e-06, + "loss": 0.84334856, + "num_input_tokens_seen": 63072560, + "step": 2937, + "time_per_iteration": 3.785931348800659 + }, + { + "auxiliary_loss_clip": 0.01135781, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_clip": 1.03665066, + "balance_loss_mlp": 1.00535727, + "epoch": 0.35327361269764923, + "flos": 15778969810560.0, + "grad_norm": 3.0508126055116054, + "language_loss": 0.80051148, + "learning_rate": 3.000449705774275e-06, + "loss": 0.82273442, + "num_input_tokens_seen": 63090800, + "step": 2938, + "time_per_iteration": 2.722266674041748 + }, + { + "auxiliary_loss_clip": 0.01143496, + "auxiliary_loss_mlp": 0.0108644, + "balance_loss_clip": 1.03754771, + "balance_loss_mlp": 1.00542521, + "epoch": 0.35339385558828834, + "flos": 22090413484800.0, + "grad_norm": 1.9499825504195987, + "language_loss": 0.71711826, + "learning_rate": 2.9997751218277654e-06, + "loss": 0.73941761, + "num_input_tokens_seen": 63108955, + "step": 2939, + "time_per_iteration": 2.7475337982177734 + }, + { + "auxiliary_loss_clip": 0.0115404, + "auxiliary_loss_mlp": 0.01086156, + "balance_loss_clip": 1.03938103, + "balance_loss_mlp": 1.00514102, + "epoch": 0.35351409847892745, + "flos": 24165008328960.0, + "grad_norm": 2.026129101420854, + "language_loss": 0.78064275, + "learning_rate": 2.999100386216166e-06, + "loss": 0.80304468, + "num_input_tokens_seen": 63127895, + "step": 2940, + "time_per_iteration": 3.6842572689056396 + }, + { + "auxiliary_loss_clip": 0.01126944, + "auxiliary_loss_mlp": 0.01086082, + "balance_loss_clip": 1.03482819, + "balance_loss_mlp": 1.00502002, + "epoch": 0.3536343413695665, + "flos": 27052298340480.0, + "grad_norm": 1.908137392312301, + "language_loss": 0.74141526, + "learning_rate": 2.998425499041831e-06, + "loss": 0.76354551, + "num_input_tokens_seen": 63148410, + "step": 2941, + "time_per_iteration": 2.8322126865386963 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.01080025, + "balance_loss_clip": 1.05884278, + "balance_loss_mlp": 1.00067925, + "epoch": 0.3537545842602056, + "flos": 65991066370560.0, + "grad_norm": 1.2584092479067286, + "language_loss": 0.64549464, + "learning_rate": 2.997750460407142e-06, + "loss": 0.66779655, + "num_input_tokens_seen": 63209765, + "step": 2942, + "time_per_iteration": 4.358407258987427 + }, + { + "auxiliary_loss_clip": 0.01124045, + "auxiliary_loss_mlp": 0.01087468, + "balance_loss_clip": 1.03455126, + "balance_loss_mlp": 1.00611925, + "epoch": 0.35387482715084473, + "flos": 18436897526400.0, + "grad_norm": 2.197882242452801, + "language_loss": 0.69963455, + "learning_rate": 2.997075270414501e-06, + "loss": 0.72174966, + "num_input_tokens_seen": 63226980, + "step": 2943, + "time_per_iteration": 2.811666488647461 + }, + { + "auxiliary_loss_clip": 0.01138199, + "auxiliary_loss_mlp": 0.01080089, + "balance_loss_clip": 1.05530214, + "balance_loss_mlp": 1.00074291, + "epoch": 0.3539950700414838, + "flos": 65588579498880.0, + "grad_norm": 0.6948263825042111, + "language_loss": 0.57762897, + "learning_rate": 2.9963999291663347e-06, + "loss": 0.59981179, + "num_input_tokens_seen": 63292760, + "step": 2944, + "time_per_iteration": 3.3477814197540283 + }, + { + "auxiliary_loss_clip": 0.01113518, + "auxiliary_loss_mlp": 0.01085523, + "balance_loss_clip": 1.03483224, + "balance_loss_mlp": 1.00436521, + "epoch": 0.3541153129321229, + "flos": 20521655919360.0, + "grad_norm": 4.035404774445957, + "language_loss": 0.73877496, + "learning_rate": 2.9957244367650915e-06, + "loss": 0.76076537, + "num_input_tokens_seen": 63309005, + "step": 2945, + "time_per_iteration": 3.811192750930786 + }, + { + "auxiliary_loss_clip": 0.01110926, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_clip": 1.03160381, + "balance_loss_mlp": 1.00622344, + "epoch": 0.354235555822762, + "flos": 19573578391680.0, + "grad_norm": 2.1142983422482824, + "language_loss": 0.83939385, + "learning_rate": 2.9950487933132425e-06, + "loss": 0.86137402, + "num_input_tokens_seen": 63326420, + "step": 2946, + "time_per_iteration": 2.9178948402404785 + }, + { + "auxiliary_loss_clip": 0.01144482, + "auxiliary_loss_mlp": 0.01087915, + "balance_loss_clip": 1.03802609, + "balance_loss_mlp": 1.00675702, + "epoch": 0.35435579871340106, + "flos": 20777268078720.0, + "grad_norm": 2.093252985660521, + "language_loss": 0.71107471, + "learning_rate": 2.994372998913283e-06, + "loss": 0.73339868, + "num_input_tokens_seen": 63344925, + "step": 2947, + "time_per_iteration": 2.8570265769958496 + }, + { + "auxiliary_loss_clip": 0.01130788, + "auxiliary_loss_mlp": 0.01087254, + "balance_loss_clip": 1.03479445, + "balance_loss_mlp": 1.00614357, + "epoch": 0.35447604160404017, + "flos": 23951807153280.0, + "grad_norm": 2.544253555676656, + "language_loss": 0.62305593, + "learning_rate": 2.99369705366773e-06, + "loss": 0.64523637, + "num_input_tokens_seen": 63365170, + "step": 2948, + "time_per_iteration": 2.8187906742095947 + }, + { + "auxiliary_loss_clip": 0.01127363, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_clip": 1.03148437, + "balance_loss_mlp": 1.0066613, + "epoch": 0.3545962844946792, + "flos": 23435662671360.0, + "grad_norm": 2.79972950544302, + "language_loss": 0.82397902, + "learning_rate": 2.9930209576791244e-06, + "loss": 0.8461318, + "num_input_tokens_seen": 63383645, + "step": 2949, + "time_per_iteration": 2.8941876888275146 + }, + { + "auxiliary_loss_clip": 0.01139977, + "auxiliary_loss_mlp": 0.01085297, + "balance_loss_clip": 1.03497696, + "balance_loss_mlp": 1.00432968, + "epoch": 0.35471652738531834, + "flos": 22085134185600.0, + "grad_norm": 2.852866246095258, + "language_loss": 0.63629091, + "learning_rate": 2.9923447110500285e-06, + "loss": 0.65854371, + "num_input_tokens_seen": 63402390, + "step": 2950, + "time_per_iteration": 2.891051769256592 + }, + { + "auxiliary_loss_clip": 0.01142838, + "auxiliary_loss_mlp": 0.01086091, + "balance_loss_clip": 1.03646111, + "balance_loss_mlp": 1.00512397, + "epoch": 0.35483677027595745, + "flos": 27341881787520.0, + "grad_norm": 1.3943841454754968, + "language_loss": 0.75253516, + "learning_rate": 2.9916683138830295e-06, + "loss": 0.7748245, + "num_input_tokens_seen": 63423055, + "step": 2951, + "time_per_iteration": 2.7636096477508545 + }, + { + "auxiliary_loss_clip": 0.01129187, + "auxiliary_loss_mlp": 0.01087102, + "balance_loss_clip": 1.03337336, + "balance_loss_mlp": 1.00599217, + "epoch": 0.3549570131665965, + "flos": 13516166678400.0, + "grad_norm": 2.0406998107341994, + "language_loss": 0.80723089, + "learning_rate": 2.9909917662807353e-06, + "loss": 0.82939374, + "num_input_tokens_seen": 63440855, + "step": 2952, + "time_per_iteration": 2.8156628608703613 + }, + { + "auxiliary_loss_clip": 0.01138503, + "auxiliary_loss_mlp": 0.01086958, + "balance_loss_clip": 1.03396463, + "balance_loss_mlp": 1.0058006, + "epoch": 0.3550772560572356, + "flos": 20887549810560.0, + "grad_norm": 2.3890959342885116, + "language_loss": 0.69605368, + "learning_rate": 2.9903150683457783e-06, + "loss": 0.71830827, + "num_input_tokens_seen": 63459400, + "step": 2953, + "time_per_iteration": 2.7413952350616455 + }, + { + "auxiliary_loss_clip": 0.01116407, + "auxiliary_loss_mlp": 0.01086793, + "balance_loss_clip": 1.03373337, + "balance_loss_mlp": 1.00573063, + "epoch": 0.3551974989478747, + "flos": 20194042947840.0, + "grad_norm": 2.4220318132860137, + "language_loss": 0.64877665, + "learning_rate": 2.9896382201808126e-06, + "loss": 0.67080867, + "num_input_tokens_seen": 63476800, + "step": 2954, + "time_per_iteration": 2.7801830768585205 + }, + { + "auxiliary_loss_clip": 0.01152723, + "auxiliary_loss_mlp": 0.01089156, + "balance_loss_clip": 1.03744042, + "balance_loss_mlp": 1.00795043, + "epoch": 0.3553177418385138, + "flos": 19828831415040.0, + "grad_norm": 2.2514090789919883, + "language_loss": 0.81142038, + "learning_rate": 2.988961221888516e-06, + "loss": 0.83383918, + "num_input_tokens_seen": 63493475, + "step": 2955, + "time_per_iteration": 2.735063076019287 + }, + { + "auxiliary_loss_clip": 0.01121778, + "auxiliary_loss_mlp": 0.01087022, + "balance_loss_clip": 1.03356838, + "balance_loss_mlp": 1.00586414, + "epoch": 0.3554379847291529, + "flos": 14829132516480.0, + "grad_norm": 2.625423077284863, + "language_loss": 0.78890091, + "learning_rate": 2.988284073571589e-06, + "loss": 0.81098896, + "num_input_tokens_seen": 63509560, + "step": 2956, + "time_per_iteration": 2.829662561416626 + }, + { + "auxiliary_loss_clip": 0.01143103, + "auxiliary_loss_mlp": 0.0087358, + "balance_loss_clip": 1.03665352, + "balance_loss_mlp": 1.00033736, + "epoch": 0.355558227619792, + "flos": 20485350247680.0, + "grad_norm": 2.209282613662136, + "language_loss": 0.72594213, + "learning_rate": 2.9876067753327528e-06, + "loss": 0.74610895, + "num_input_tokens_seen": 63527290, + "step": 2957, + "time_per_iteration": 2.744189500808716 + }, + { + "auxiliary_loss_clip": 0.01129735, + "auxiliary_loss_mlp": 0.0108629, + "balance_loss_clip": 1.03727448, + "balance_loss_mlp": 1.00503683, + "epoch": 0.35567847051043106, + "flos": 37663613256960.0, + "grad_norm": 2.015322324247417, + "language_loss": 0.80454946, + "learning_rate": 2.986929327274754e-06, + "loss": 0.82670975, + "num_input_tokens_seen": 63547870, + "step": 2958, + "time_per_iteration": 2.901942014694214 + }, + { + "auxiliary_loss_clip": 0.01140782, + "auxiliary_loss_mlp": 0.01088714, + "balance_loss_clip": 1.03586185, + "balance_loss_mlp": 1.00774717, + "epoch": 0.35579871340107017, + "flos": 26943058103040.0, + "grad_norm": 1.6221897566404566, + "language_loss": 0.78740764, + "learning_rate": 2.9862517295003617e-06, + "loss": 0.80970258, + "num_input_tokens_seen": 63568285, + "step": 2959, + "time_per_iteration": 2.77517032623291 + }, + { + "auxiliary_loss_clip": 0.01123238, + "auxiliary_loss_mlp": 0.01086038, + "balance_loss_clip": 1.03373075, + "balance_loss_mlp": 1.00492787, + "epoch": 0.3559189562917093, + "flos": 28293335193600.0, + "grad_norm": 1.9282843920470012, + "language_loss": 0.72109616, + "learning_rate": 2.9855739821123654e-06, + "loss": 0.74318892, + "num_input_tokens_seen": 63589865, + "step": 2960, + "time_per_iteration": 2.880641460418701 + }, + { + "auxiliary_loss_clip": 0.0113675, + "auxiliary_loss_mlp": 0.0108576, + "balance_loss_clip": 1.03277636, + "balance_loss_mlp": 1.00474477, + "epoch": 0.35603919918234833, + "flos": 25664063552640.0, + "grad_norm": 1.8673182778837785, + "language_loss": 0.82160795, + "learning_rate": 2.98489608521358e-06, + "loss": 0.84383309, + "num_input_tokens_seen": 63609805, + "step": 2961, + "time_per_iteration": 2.775850296020508 + }, + { + "auxiliary_loss_clip": 0.01144721, + "auxiliary_loss_mlp": 0.00873587, + "balance_loss_clip": 1.03772545, + "balance_loss_mlp": 1.00040817, + "epoch": 0.35615944207298744, + "flos": 23000856537600.0, + "grad_norm": 1.939423129739608, + "language_loss": 0.79582739, + "learning_rate": 2.9842180389068425e-06, + "loss": 0.81601042, + "num_input_tokens_seen": 63627115, + "step": 2962, + "time_per_iteration": 3.695960521697998 + }, + { + "auxiliary_loss_clip": 0.01113679, + "auxiliary_loss_mlp": 0.01082504, + "balance_loss_clip": 1.03913713, + "balance_loss_mlp": 1.00315785, + "epoch": 0.35627968496362655, + "flos": 68251283723520.0, + "grad_norm": 0.8534897586772983, + "language_loss": 0.59219533, + "learning_rate": 2.98353984329501e-06, + "loss": 0.61415714, + "num_input_tokens_seen": 63691460, + "step": 2963, + "time_per_iteration": 3.4148778915405273 + }, + { + "auxiliary_loss_clip": 0.01130505, + "auxiliary_loss_mlp": 0.01087037, + "balance_loss_clip": 1.03448462, + "balance_loss_mlp": 1.00583196, + "epoch": 0.3563999278542656, + "flos": 22641714403200.0, + "grad_norm": 1.6810083869494064, + "language_loss": 0.70581478, + "learning_rate": 2.982861498480965e-06, + "loss": 0.72799021, + "num_input_tokens_seen": 63713840, + "step": 2964, + "time_per_iteration": 2.799144983291626 + }, + { + "auxiliary_loss_clip": 0.01124395, + "auxiliary_loss_mlp": 0.01087235, + "balance_loss_clip": 1.03458261, + "balance_loss_mlp": 1.0061723, + "epoch": 0.3565201707449047, + "flos": 25952533678080.0, + "grad_norm": 1.618636996730611, + "language_loss": 0.82516301, + "learning_rate": 2.9821830045676122e-06, + "loss": 0.84727925, + "num_input_tokens_seen": 63733540, + "step": 2965, + "time_per_iteration": 3.6619768142700195 + }, + { + "auxiliary_loss_clip": 0.01153636, + "auxiliary_loss_mlp": 0.01088376, + "balance_loss_clip": 1.03862214, + "balance_loss_mlp": 1.00721788, + "epoch": 0.3566404136355438, + "flos": 28475725478400.0, + "grad_norm": 1.6453601603684374, + "language_loss": 0.72986484, + "learning_rate": 2.9815043616578793e-06, + "loss": 0.75228488, + "num_input_tokens_seen": 63754335, + "step": 2966, + "time_per_iteration": 2.710204601287842 + }, + { + "auxiliary_loss_clip": 0.01125085, + "auxiliary_loss_mlp": 0.01088114, + "balance_loss_clip": 1.03533769, + "balance_loss_mlp": 1.00676513, + "epoch": 0.3567606565261829, + "flos": 38363117690880.0, + "grad_norm": 2.2749400813182588, + "language_loss": 0.76671815, + "learning_rate": 2.9808255698547145e-06, + "loss": 0.78885007, + "num_input_tokens_seen": 63777135, + "step": 2967, + "time_per_iteration": 2.841561794281006 + }, + { + "auxiliary_loss_clip": 0.01143549, + "auxiliary_loss_mlp": 0.01086995, + "balance_loss_clip": 1.03838086, + "balance_loss_mlp": 1.00593245, + "epoch": 0.356880899416822, + "flos": 21981029592960.0, + "grad_norm": 2.1888317430203377, + "language_loss": 0.79680729, + "learning_rate": 2.9801466292610913e-06, + "loss": 0.81911272, + "num_input_tokens_seen": 63797020, + "step": 2968, + "time_per_iteration": 3.491314649581909 + }, + { + "auxiliary_loss_clip": 0.01140892, + "auxiliary_loss_mlp": 0.01086547, + "balance_loss_clip": 1.03500545, + "balance_loss_mlp": 1.00548494, + "epoch": 0.35700114230746105, + "flos": 18989132198400.0, + "grad_norm": 2.54181180246158, + "language_loss": 0.80955327, + "learning_rate": 2.979467539980003e-06, + "loss": 0.83182764, + "num_input_tokens_seen": 63813810, + "step": 2969, + "time_per_iteration": 2.773374557495117 + }, + { + "auxiliary_loss_clip": 0.01143056, + "auxiliary_loss_mlp": 0.0108646, + "balance_loss_clip": 1.03661036, + "balance_loss_mlp": 1.00525498, + "epoch": 0.35712138519810016, + "flos": 19756112330880.0, + "grad_norm": 1.9896977299418626, + "language_loss": 0.76863825, + "learning_rate": 2.978788302114468e-06, + "loss": 0.79093337, + "num_input_tokens_seen": 63830925, + "step": 2970, + "time_per_iteration": 3.5751161575317383 + }, + { + "auxiliary_loss_clip": 0.01135296, + "auxiliary_loss_mlp": 0.0108754, + "balance_loss_clip": 1.03518915, + "balance_loss_mlp": 1.00628674, + "epoch": 0.35724162808873927, + "flos": 35183012008320.0, + "grad_norm": 4.666875734930108, + "language_loss": 0.83242261, + "learning_rate": 2.9781089157675255e-06, + "loss": 0.85465097, + "num_input_tokens_seen": 63849385, + "step": 2971, + "time_per_iteration": 2.816617965698242 + }, + { + "auxiliary_loss_clip": 0.01134403, + "auxiliary_loss_mlp": 0.01085894, + "balance_loss_clip": 1.03102899, + "balance_loss_mlp": 1.00492728, + "epoch": 0.3573618709793783, + "flos": 25556726736000.0, + "grad_norm": 1.532136932679674, + "language_loss": 0.88202655, + "learning_rate": 2.977429381042238e-06, + "loss": 0.90422952, + "num_input_tokens_seen": 63870060, + "step": 2972, + "time_per_iteration": 2.8204638957977295 + }, + { + "auxiliary_loss_clip": 0.0113319, + "auxiliary_loss_mlp": 0.0108526, + "balance_loss_clip": 1.03588212, + "balance_loss_mlp": 1.00438881, + "epoch": 0.35748211387001744, + "flos": 29132352051840.0, + "grad_norm": 2.112633793997536, + "language_loss": 0.89011919, + "learning_rate": 2.9767496980416913e-06, + "loss": 0.91230363, + "num_input_tokens_seen": 63889355, + "step": 2973, + "time_per_iteration": 2.7770276069641113 + }, + { + "auxiliary_loss_clip": 0.01129957, + "auxiliary_loss_mlp": 0.01086114, + "balance_loss_clip": 1.03276491, + "balance_loss_mlp": 1.0049088, + "epoch": 0.35760235676065655, + "flos": 13954169122560.0, + "grad_norm": 2.5697070875254067, + "language_loss": 0.80995983, + "learning_rate": 2.9760698668689914e-06, + "loss": 0.8321206, + "num_input_tokens_seen": 63905580, + "step": 2974, + "time_per_iteration": 2.706341028213501 + }, + { + "auxiliary_loss_clip": 0.01141067, + "auxiliary_loss_mlp": 0.01087169, + "balance_loss_clip": 1.03473186, + "balance_loss_mlp": 1.00615382, + "epoch": 0.3577225996512956, + "flos": 44018688977280.0, + "grad_norm": 1.7461755502799474, + "language_loss": 0.71347046, + "learning_rate": 2.975389887627269e-06, + "loss": 0.73575282, + "num_input_tokens_seen": 63928180, + "step": 2975, + "time_per_iteration": 2.8774194717407227 + }, + { + "auxiliary_loss_clip": 0.01123604, + "auxiliary_loss_mlp": 0.01087262, + "balance_loss_clip": 1.03444719, + "balance_loss_mlp": 1.00619924, + "epoch": 0.3578428425419347, + "flos": 17055199013760.0, + "grad_norm": 2.059371417732999, + "language_loss": 0.89841616, + "learning_rate": 2.9747097604196764e-06, + "loss": 0.92052484, + "num_input_tokens_seen": 63944825, + "step": 2976, + "time_per_iteration": 2.847444534301758 + }, + { + "auxiliary_loss_clip": 0.01117491, + "auxiliary_loss_mlp": 0.01080009, + "balance_loss_clip": 1.05089462, + "balance_loss_mlp": 1.00066364, + "epoch": 0.3579630854325738, + "flos": 71676550707840.0, + "grad_norm": 0.7580146170776446, + "language_loss": 0.56665689, + "learning_rate": 2.9740294853493875e-06, + "loss": 0.58863187, + "num_input_tokens_seen": 64016385, + "step": 2977, + "time_per_iteration": 3.710871934890747 + }, + { + "auxiliary_loss_clip": 0.01112837, + "auxiliary_loss_mlp": 0.01086707, + "balance_loss_clip": 1.03167105, + "balance_loss_mlp": 1.00569201, + "epoch": 0.3580833283232129, + "flos": 25046651652480.0, + "grad_norm": 1.892229044950576, + "language_loss": 0.66821444, + "learning_rate": 2.9733490625196008e-06, + "loss": 0.69020987, + "num_input_tokens_seen": 64036245, + "step": 2978, + "time_per_iteration": 2.8476922512054443 + }, + { + "auxiliary_loss_clip": 0.01117287, + "auxiliary_loss_mlp": 0.01086257, + "balance_loss_clip": 1.03431547, + "balance_loss_mlp": 1.00529003, + "epoch": 0.358203571213852, + "flos": 13953127628160.0, + "grad_norm": 3.024334815181296, + "language_loss": 0.75334769, + "learning_rate": 2.9726684920335353e-06, + "loss": 0.77538317, + "num_input_tokens_seen": 64054110, + "step": 2979, + "time_per_iteration": 2.805083751678467 + }, + { + "auxiliary_loss_clip": 0.01150305, + "auxiliary_loss_mlp": 0.00873605, + "balance_loss_clip": 1.03491449, + "balance_loss_mlp": 1.000278, + "epoch": 0.35832381410449105, + "flos": 20302457172480.0, + "grad_norm": 2.1871963469002327, + "language_loss": 0.8200748, + "learning_rate": 2.971987773994432e-06, + "loss": 0.84031391, + "num_input_tokens_seen": 64070295, + "step": 2980, + "time_per_iteration": 2.695122003555298 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_clip": 1.03494442, + "balance_loss_mlp": 1.00662661, + "epoch": 0.35844405699513016, + "flos": 16983234115200.0, + "grad_norm": 1.8389152016886452, + "language_loss": 0.83253908, + "learning_rate": 2.9713069085055566e-06, + "loss": 0.85483372, + "num_input_tokens_seen": 64088605, + "step": 2981, + "time_per_iteration": 2.712095022201538 + }, + { + "auxiliary_loss_clip": 0.01119188, + "auxiliary_loss_mlp": 0.01087453, + "balance_loss_clip": 1.03124452, + "balance_loss_mlp": 1.00639129, + "epoch": 0.35856429988576927, + "flos": 23216858974080.0, + "grad_norm": 2.4651197326691303, + "language_loss": 0.78995132, + "learning_rate": 2.9706258956701958e-06, + "loss": 0.81201774, + "num_input_tokens_seen": 64108595, + "step": 2982, + "time_per_iteration": 2.7461795806884766 + }, + { + "auxiliary_loss_clip": 0.01142158, + "auxiliary_loss_mlp": 0.01086454, + "balance_loss_clip": 1.03594613, + "balance_loss_mlp": 1.00534368, + "epoch": 0.3586845427764083, + "flos": 23034576430080.0, + "grad_norm": 3.002317362951063, + "language_loss": 0.77177918, + "learning_rate": 2.9699447355916575e-06, + "loss": 0.7940653, + "num_input_tokens_seen": 64127405, + "step": 2983, + "time_per_iteration": 2.7662723064422607 + }, + { + "auxiliary_loss_clip": 0.01151896, + "auxiliary_loss_mlp": 0.00873495, + "balance_loss_clip": 1.03740144, + "balance_loss_mlp": 1.00021434, + "epoch": 0.35880478566704743, + "flos": 20010682995840.0, + "grad_norm": 2.0716850951625125, + "language_loss": 0.73856127, + "learning_rate": 2.969263428373275e-06, + "loss": 0.75881523, + "num_input_tokens_seen": 64145755, + "step": 2984, + "time_per_iteration": 2.6620101928710938 + }, + { + "auxiliary_loss_clip": 0.01133216, + "auxiliary_loss_mlp": 0.01086578, + "balance_loss_clip": 1.03571641, + "balance_loss_mlp": 1.00561142, + "epoch": 0.35892502855768654, + "flos": 13699095667200.0, + "grad_norm": 2.409328800041833, + "language_loss": 0.78735441, + "learning_rate": 2.9685819741184007e-06, + "loss": 0.80955231, + "num_input_tokens_seen": 64164195, + "step": 2985, + "time_per_iteration": 2.7740437984466553 + }, + { + "auxiliary_loss_clip": 0.01119215, + "auxiliary_loss_mlp": 0.01085984, + "balance_loss_clip": 1.0316534, + "balance_loss_mlp": 1.00515974, + "epoch": 0.3590452714483256, + "flos": 18114096977280.0, + "grad_norm": 2.5494460011174604, + "language_loss": 0.68374085, + "learning_rate": 2.967900372930411e-06, + "loss": 0.70579284, + "num_input_tokens_seen": 64182705, + "step": 2986, + "time_per_iteration": 2.8753228187561035 + }, + { + "auxiliary_loss_clip": 0.01132204, + "auxiliary_loss_mlp": 0.01088155, + "balance_loss_clip": 1.03358579, + "balance_loss_mlp": 1.00704539, + "epoch": 0.3591655143389647, + "flos": 17749352321280.0, + "grad_norm": 2.563046740793305, + "language_loss": 0.78787577, + "learning_rate": 2.9672186249127046e-06, + "loss": 0.8100794, + "num_input_tokens_seen": 64202170, + "step": 2987, + "time_per_iteration": 3.7076196670532227 + }, + { + "auxiliary_loss_clip": 0.01134215, + "auxiliary_loss_mlp": 0.01086509, + "balance_loss_clip": 1.03720617, + "balance_loss_mlp": 1.00563741, + "epoch": 0.3592857572296038, + "flos": 25224409082880.0, + "grad_norm": 1.9461650103532366, + "language_loss": 0.78957283, + "learning_rate": 2.9665367301687014e-06, + "loss": 0.8117801, + "num_input_tokens_seen": 64220415, + "step": 2988, + "time_per_iteration": 2.810676336288452 + }, + { + "auxiliary_loss_clip": 0.01132896, + "auxiliary_loss_mlp": 0.01087359, + "balance_loss_clip": 1.03435302, + "balance_loss_mlp": 1.00610566, + "epoch": 0.3594060001202429, + "flos": 29384408764800.0, + "grad_norm": 2.22817475586196, + "language_loss": 0.76728761, + "learning_rate": 2.965854688801845e-06, + "loss": 0.7894901, + "num_input_tokens_seen": 64242475, + "step": 2989, + "time_per_iteration": 2.8395116329193115 + }, + { + "auxiliary_loss_clip": 0.01141142, + "auxiliary_loss_mlp": 0.0108552, + "balance_loss_clip": 1.03455806, + "balance_loss_mlp": 1.00450492, + "epoch": 0.359526243010882, + "flos": 17052900543360.0, + "grad_norm": 2.317298604859806, + "language_loss": 0.76516145, + "learning_rate": 2.9651725009156005e-06, + "loss": 0.78742802, + "num_input_tokens_seen": 64260220, + "step": 2990, + "time_per_iteration": 2.757683277130127 + }, + { + "auxiliary_loss_clip": 0.01131042, + "auxiliary_loss_mlp": 0.01087582, + "balance_loss_clip": 1.03406429, + "balance_loss_mlp": 1.00661469, + "epoch": 0.3596464859015211, + "flos": 22965089569920.0, + "grad_norm": 1.8146171291878834, + "language_loss": 0.74443632, + "learning_rate": 2.964490166613454e-06, + "loss": 0.76662254, + "num_input_tokens_seen": 64280145, + "step": 2991, + "time_per_iteration": 3.7228665351867676 + }, + { + "auxiliary_loss_clip": 0.01151585, + "auxiliary_loss_mlp": 0.01079809, + "balance_loss_clip": 1.05174625, + "balance_loss_mlp": 1.00046301, + "epoch": 0.35976672879216015, + "flos": 54739462590720.0, + "grad_norm": 0.7635783604962886, + "language_loss": 0.57753122, + "learning_rate": 2.963807685998917e-06, + "loss": 0.59984517, + "num_input_tokens_seen": 64336010, + "step": 2992, + "time_per_iteration": 3.0630781650543213 + }, + { + "auxiliary_loss_clip": 0.01113547, + "auxiliary_loss_mlp": 0.01087715, + "balance_loss_clip": 1.03301203, + "balance_loss_mlp": 1.00674796, + "epoch": 0.35988697168279926, + "flos": 43139020901760.0, + "grad_norm": 1.4551529655408435, + "language_loss": 0.77945894, + "learning_rate": 2.9631250591755196e-06, + "loss": 0.80147159, + "num_input_tokens_seen": 64358725, + "step": 2993, + "time_per_iteration": 4.0270655155181885 + }, + { + "auxiliary_loss_clip": 0.01123364, + "auxiliary_loss_mlp": 0.01087198, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.00618303, + "epoch": 0.36000721457343837, + "flos": 35845600239360.0, + "grad_norm": 1.9402711100963759, + "language_loss": 0.57472861, + "learning_rate": 2.962442286246817e-06, + "loss": 0.59683418, + "num_input_tokens_seen": 64381555, + "step": 2994, + "time_per_iteration": 2.9563372135162354 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01086487, + "balance_loss_clip": 1.0349133, + "balance_loss_mlp": 1.00551987, + "epoch": 0.3601274574640774, + "flos": 18291100222080.0, + "grad_norm": 1.8902874668556073, + "language_loss": 0.69913942, + "learning_rate": 2.9617593673163853e-06, + "loss": 0.72117901, + "num_input_tokens_seen": 64400375, + "step": 2995, + "time_per_iteration": 3.719071388244629 + }, + { + "auxiliary_loss_clip": 0.01133521, + "auxiliary_loss_mlp": 0.01086689, + "balance_loss_clip": 1.03512502, + "balance_loss_mlp": 1.00572157, + "epoch": 0.36024770035471654, + "flos": 13333955961600.0, + "grad_norm": 2.067299133741662, + "language_loss": 0.77470088, + "learning_rate": 2.9610763024878216e-06, + "loss": 0.79690295, + "num_input_tokens_seen": 64415880, + "step": 2996, + "time_per_iteration": 2.761415481567383 + }, + { + "auxiliary_loss_clip": 0.01133572, + "auxiliary_loss_mlp": 0.01086964, + "balance_loss_clip": 1.03519654, + "balance_loss_mlp": 1.00613964, + "epoch": 0.3603679432453556, + "flos": 20267013427200.0, + "grad_norm": 3.651519251290324, + "language_loss": 0.91232109, + "learning_rate": 2.960393091864747e-06, + "loss": 0.93452644, + "num_input_tokens_seen": 64434260, + "step": 2997, + "time_per_iteration": 2.833611011505127 + }, + { + "auxiliary_loss_clip": 0.01132181, + "auxiliary_loss_mlp": 0.01086255, + "balance_loss_clip": 1.03529537, + "balance_loss_mlp": 1.00533533, + "epoch": 0.3604881861359947, + "flos": 22451135817600.0, + "grad_norm": 3.169415744137096, + "language_loss": 0.75154674, + "learning_rate": 2.959709735550804e-06, + "loss": 0.77373111, + "num_input_tokens_seen": 64453855, + "step": 2998, + "time_per_iteration": 2.8196229934692383 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_clip": 1.03293681, + "balance_loss_mlp": 1.00476098, + "epoch": 0.3606084290266338, + "flos": 22054251467520.0, + "grad_norm": 2.2046373928529417, + "language_loss": 0.75630754, + "learning_rate": 2.9590262336496575e-06, + "loss": 0.77829695, + "num_input_tokens_seen": 64473585, + "step": 2999, + "time_per_iteration": 2.9093151092529297 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.01085532, + "balance_loss_clip": 1.03014636, + "balance_loss_mlp": 1.00456488, + "epoch": 0.36072867191727287, + "flos": 15632921111040.0, + "grad_norm": 2.0784858784517564, + "language_loss": 0.85433567, + "learning_rate": 2.9583425862649936e-06, + "loss": 0.8763603, + "num_input_tokens_seen": 64491720, + "step": 3000, + "time_per_iteration": 2.902980089187622 + }, + { + "auxiliary_loss_clip": 0.01153278, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_clip": 1.03814971, + "balance_loss_mlp": 1.00591636, + "epoch": 0.360848914807912, + "flos": 19677000625920.0, + "grad_norm": 2.2085222212400697, + "language_loss": 0.74345136, + "learning_rate": 2.9576587935005215e-06, + "loss": 0.7658549, + "num_input_tokens_seen": 64509800, + "step": 3001, + "time_per_iteration": 2.77618408203125 + }, + { + "auxiliary_loss_clip": 0.01141876, + "auxiliary_loss_mlp": 0.01086198, + "balance_loss_clip": 1.03543425, + "balance_loss_mlp": 1.00508845, + "epoch": 0.3609691576985511, + "flos": 18877808972160.0, + "grad_norm": 2.354005744973407, + "language_loss": 0.72008675, + "learning_rate": 2.9569748554599713e-06, + "loss": 0.74236751, + "num_input_tokens_seen": 64525410, + "step": 3002, + "time_per_iteration": 2.7490556240081787 + }, + { + "auxiliary_loss_clip": 0.01131125, + "auxiliary_loss_mlp": 0.01088704, + "balance_loss_clip": 1.03429008, + "balance_loss_mlp": 1.00778484, + "epoch": 0.36108940058919015, + "flos": 42224088648960.0, + "grad_norm": 2.1083410314693154, + "language_loss": 0.73037559, + "learning_rate": 2.956290772247097e-06, + "loss": 0.75257385, + "num_input_tokens_seen": 64544085, + "step": 3003, + "time_per_iteration": 2.922218084335327 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.010877, + "balance_loss_clip": 1.03347409, + "balance_loss_mlp": 1.00668514, + "epoch": 0.36120964347982926, + "flos": 23185150243200.0, + "grad_norm": 1.6396454996173917, + "language_loss": 0.73377407, + "learning_rate": 2.9556065439656724e-06, + "loss": 0.75571638, + "num_input_tokens_seen": 64563135, + "step": 3004, + "time_per_iteration": 3.0139901638031006 + }, + { + "auxiliary_loss_clip": 0.01101582, + "auxiliary_loss_mlp": 0.01086666, + "balance_loss_clip": 1.03080237, + "balance_loss_mlp": 1.0057472, + "epoch": 0.36132988637046837, + "flos": 18113055482880.0, + "grad_norm": 4.165227661318742, + "language_loss": 0.81546116, + "learning_rate": 2.9549221707194952e-06, + "loss": 0.83734363, + "num_input_tokens_seen": 64581985, + "step": 3005, + "time_per_iteration": 2.889950752258301 + }, + { + "auxiliary_loss_clip": 0.01142372, + "auxiliary_loss_mlp": 0.01085434, + "balance_loss_clip": 1.03644705, + "balance_loss_mlp": 1.00437188, + "epoch": 0.3614501292611074, + "flos": 27813101333760.0, + "grad_norm": 1.974231523783446, + "language_loss": 0.72406113, + "learning_rate": 2.954237652612384e-06, + "loss": 0.7463392, + "num_input_tokens_seen": 64601035, + "step": 3006, + "time_per_iteration": 2.8442084789276123 + }, + { + "auxiliary_loss_clip": 0.01130889, + "auxiliary_loss_mlp": 0.01085593, + "balance_loss_clip": 1.03450561, + "balance_loss_mlp": 1.00467336, + "epoch": 0.36157037215174653, + "flos": 22634926732800.0, + "grad_norm": 2.642668418653884, + "language_loss": 0.8417542, + "learning_rate": 2.9535529897481796e-06, + "loss": 0.86391902, + "num_input_tokens_seen": 64618580, + "step": 3007, + "time_per_iteration": 2.7661194801330566 + }, + { + "auxiliary_loss_clip": 0.01150745, + "auxiliary_loss_mlp": 0.01087077, + "balance_loss_clip": 1.03572881, + "balance_loss_mlp": 1.00606203, + "epoch": 0.36169061504238564, + "flos": 12600839376000.0, + "grad_norm": 2.138781524491859, + "language_loss": 0.7694366, + "learning_rate": 2.9528681822307446e-06, + "loss": 0.7918148, + "num_input_tokens_seen": 64635430, + "step": 3008, + "time_per_iteration": 2.793625593185425 + }, + { + "auxiliary_loss_clip": 0.0113852, + "auxiliary_loss_mlp": 0.00873422, + "balance_loss_clip": 1.03317702, + "balance_loss_mlp": 1.00030947, + "epoch": 0.3618108579330247, + "flos": 26684644682880.0, + "grad_norm": 1.9744753448514523, + "language_loss": 0.8250941, + "learning_rate": 2.952183230163964e-06, + "loss": 0.84521347, + "num_input_tokens_seen": 64655005, + "step": 3009, + "time_per_iteration": 2.7606360912323 + }, + { + "auxiliary_loss_clip": 0.01120955, + "auxiliary_loss_mlp": 0.01085601, + "balance_loss_clip": 1.03249872, + "balance_loss_mlp": 1.00472951, + "epoch": 0.3619311008236638, + "flos": 22817029708800.0, + "grad_norm": 2.100096218010081, + "language_loss": 0.72844076, + "learning_rate": 2.9514981336517448e-06, + "loss": 0.75050628, + "num_input_tokens_seen": 64674775, + "step": 3010, + "time_per_iteration": 2.810572624206543 + }, + { + "auxiliary_loss_clip": 0.01139192, + "auxiliary_loss_mlp": 0.01086343, + "balance_loss_clip": 1.03455329, + "balance_loss_mlp": 1.00537574, + "epoch": 0.36205134371430286, + "flos": 25919603884800.0, + "grad_norm": 2.07812406389795, + "language_loss": 0.81037956, + "learning_rate": 2.950812892798015e-06, + "loss": 0.83263493, + "num_input_tokens_seen": 64695670, + "step": 3011, + "time_per_iteration": 2.7660679817199707 + }, + { + "auxiliary_loss_clip": 0.0111282, + "auxiliary_loss_mlp": 0.00873393, + "balance_loss_clip": 1.03419018, + "balance_loss_mlp": 1.0003233, + "epoch": 0.362171586604942, + "flos": 26139592730880.0, + "grad_norm": 2.173908475037765, + "language_loss": 0.87330538, + "learning_rate": 2.9501275077067256e-06, + "loss": 0.8931675, + "num_input_tokens_seen": 64716290, + "step": 3012, + "time_per_iteration": 2.8257620334625244 + }, + { + "auxiliary_loss_clip": 0.01105569, + "auxiliary_loss_mlp": 0.01086958, + "balance_loss_clip": 1.03400588, + "balance_loss_mlp": 1.00613427, + "epoch": 0.3622918294955811, + "flos": 28074208273920.0, + "grad_norm": 1.5593930125088329, + "language_loss": 0.88627273, + "learning_rate": 2.949441978481848e-06, + "loss": 0.908198, + "num_input_tokens_seen": 64737190, + "step": 3013, + "time_per_iteration": 3.8176798820495605 + }, + { + "auxiliary_loss_clip": 0.01107402, + "auxiliary_loss_mlp": 0.01088195, + "balance_loss_clip": 1.03301454, + "balance_loss_mlp": 1.00708508, + "epoch": 0.36241207238622014, + "flos": 19828005402240.0, + "grad_norm": 3.355484159497436, + "language_loss": 0.80148542, + "learning_rate": 2.9487563052273778e-06, + "loss": 0.82344139, + "num_input_tokens_seen": 64753950, + "step": 3014, + "time_per_iteration": 2.7485299110412598 + }, + { + "auxiliary_loss_clip": 0.01133485, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_clip": 1.03067183, + "balance_loss_mlp": 1.00595164, + "epoch": 0.36253231527685925, + "flos": 21397158017280.0, + "grad_norm": 1.8055065859059098, + "language_loss": 0.85727191, + "learning_rate": 2.94807048804733e-06, + "loss": 0.87947643, + "num_input_tokens_seen": 64773570, + "step": 3015, + "time_per_iteration": 2.767606258392334 + }, + { + "auxiliary_loss_clip": 0.01111603, + "auxiliary_loss_mlp": 0.0108658, + "balance_loss_clip": 1.03596556, + "balance_loss_mlp": 1.00561261, + "epoch": 0.36265255816749836, + "flos": 18362885552640.0, + "grad_norm": 2.013566764635145, + "language_loss": 0.90252924, + "learning_rate": 2.9473845270457434e-06, + "loss": 0.92451113, + "num_input_tokens_seen": 64790385, + "step": 3016, + "time_per_iteration": 2.8029003143310547 + }, + { + "auxiliary_loss_clip": 0.01128644, + "auxiliary_loss_mlp": 0.01086604, + "balance_loss_clip": 1.03195965, + "balance_loss_mlp": 1.00568521, + "epoch": 0.3627728010581374, + "flos": 18660046769280.0, + "grad_norm": 2.4967238186768324, + "language_loss": 0.69683349, + "learning_rate": 2.946698422326677e-06, + "loss": 0.71898603, + "num_input_tokens_seen": 64807845, + "step": 3017, + "time_per_iteration": 3.6549713611602783 + }, + { + "auxiliary_loss_clip": 0.01103043, + "auxiliary_loss_mlp": 0.01086614, + "balance_loss_clip": 1.02932644, + "balance_loss_mlp": 1.0056473, + "epoch": 0.36289304394877653, + "flos": 27524272072320.0, + "grad_norm": 5.32602362912112, + "language_loss": 0.79566944, + "learning_rate": 2.946012173994213e-06, + "loss": 0.81756604, + "num_input_tokens_seen": 64827630, + "step": 3018, + "time_per_iteration": 2.9100074768066406 + }, + { + "auxiliary_loss_clip": 0.01140031, + "auxiliary_loss_mlp": 0.01086475, + "balance_loss_clip": 1.03663921, + "balance_loss_mlp": 1.00555611, + "epoch": 0.36301328683941564, + "flos": 34533244932480.0, + "grad_norm": 1.4200337667723644, + "language_loss": 0.67568475, + "learning_rate": 2.945325782152454e-06, + "loss": 0.69794977, + "num_input_tokens_seen": 64850665, + "step": 3019, + "time_per_iteration": 3.819129228591919 + }, + { + "auxiliary_loss_clip": 0.01134565, + "auxiliary_loss_mlp": 0.01086526, + "balance_loss_clip": 1.03660059, + "balance_loss_mlp": 1.00565422, + "epoch": 0.3631335297300547, + "flos": 19025976574080.0, + "grad_norm": 2.1404460097262823, + "language_loss": 0.79126298, + "learning_rate": 2.9446392469055257e-06, + "loss": 0.81347388, + "num_input_tokens_seen": 64868700, + "step": 3020, + "time_per_iteration": 2.6986336708068848 + }, + { + "auxiliary_loss_clip": 0.0111521, + "auxiliary_loss_mlp": 0.0108631, + "balance_loss_clip": 1.03445578, + "balance_loss_mlp": 1.00543809, + "epoch": 0.3632537726206938, + "flos": 19536769929600.0, + "grad_norm": 2.1116547922296256, + "language_loss": 0.79714584, + "learning_rate": 2.9439525683575745e-06, + "loss": 0.81916106, + "num_input_tokens_seen": 64887620, + "step": 3021, + "time_per_iteration": 3.8045804500579834 + }, + { + "auxiliary_loss_clip": 0.01152438, + "auxiliary_loss_mlp": 0.0108738, + "balance_loss_clip": 1.03747129, + "balance_loss_mlp": 1.00636578, + "epoch": 0.3633740155113329, + "flos": 21068611292160.0, + "grad_norm": 2.1456092344391124, + "language_loss": 0.75065136, + "learning_rate": 2.9432657466127694e-06, + "loss": 0.77304959, + "num_input_tokens_seen": 64907190, + "step": 3022, + "time_per_iteration": 2.8329062461853027 + }, + { + "auxiliary_loss_clip": 0.01111231, + "auxiliary_loss_mlp": 0.01087616, + "balance_loss_clip": 1.03386068, + "balance_loss_mlp": 1.00664926, + "epoch": 0.36349425840197197, + "flos": 20298722158080.0, + "grad_norm": 1.7210799075969236, + "language_loss": 0.76396716, + "learning_rate": 2.9425787817753007e-06, + "loss": 0.78595561, + "num_input_tokens_seen": 64925850, + "step": 3023, + "time_per_iteration": 3.012413740158081 + }, + { + "auxiliary_loss_clip": 0.01104032, + "auxiliary_loss_mlp": 0.01086058, + "balance_loss_clip": 1.03146529, + "balance_loss_mlp": 1.00504327, + "epoch": 0.3636145012926111, + "flos": 29716762331520.0, + "grad_norm": 1.8756458422003814, + "language_loss": 0.71465337, + "learning_rate": 2.94189167394938e-06, + "loss": 0.73655427, + "num_input_tokens_seen": 64948285, + "step": 3024, + "time_per_iteration": 3.013775110244751 + }, + { + "auxiliary_loss_clip": 0.01151728, + "auxiliary_loss_mlp": 0.01086569, + "balance_loss_clip": 1.03749156, + "balance_loss_mlp": 1.00569773, + "epoch": 0.3637347441832502, + "flos": 21431847576960.0, + "grad_norm": 1.6163725315418436, + "language_loss": 0.80861777, + "learning_rate": 2.941204423239241e-06, + "loss": 0.8310008, + "num_input_tokens_seen": 64967160, + "step": 3025, + "time_per_iteration": 2.6511285305023193 + }, + { + "auxiliary_loss_clip": 0.01141233, + "auxiliary_loss_mlp": 0.01087002, + "balance_loss_clip": 1.03669786, + "balance_loss_mlp": 1.00608301, + "epoch": 0.36385498707388925, + "flos": 29533941083520.0, + "grad_norm": 3.255130441482461, + "language_loss": 0.75886655, + "learning_rate": 2.9405170297491395e-06, + "loss": 0.78114897, + "num_input_tokens_seen": 64987155, + "step": 3026, + "time_per_iteration": 2.808070182800293 + }, + { + "auxiliary_loss_clip": 0.01082145, + "auxiliary_loss_mlp": 0.00873519, + "balance_loss_clip": 1.0230037, + "balance_loss_mlp": 1.00029576, + "epoch": 0.36397522996452836, + "flos": 22236569925120.0, + "grad_norm": 1.800405359529903, + "language_loss": 0.80759519, + "learning_rate": 2.939829493583353e-06, + "loss": 0.82715178, + "num_input_tokens_seen": 65003800, + "step": 3027, + "time_per_iteration": 2.8439674377441406 + }, + { + "auxiliary_loss_clip": 0.01124901, + "auxiliary_loss_mlp": 0.010866, + "balance_loss_clip": 1.03531885, + "balance_loss_mlp": 1.00568116, + "epoch": 0.3640954728551674, + "flos": 21506505995520.0, + "grad_norm": 2.2951646770671825, + "language_loss": 0.83305991, + "learning_rate": 2.939141814846179e-06, + "loss": 0.8551749, + "num_input_tokens_seen": 65021215, + "step": 3028, + "time_per_iteration": 2.806978702545166 + }, + { + "auxiliary_loss_clip": 0.01131264, + "auxiliary_loss_mlp": 0.0108724, + "balance_loss_clip": 1.03417146, + "balance_loss_mlp": 1.00627327, + "epoch": 0.3642157157458065, + "flos": 17712867081600.0, + "grad_norm": 1.8788750277169857, + "language_loss": 0.82606608, + "learning_rate": 2.938453993641938e-06, + "loss": 0.8482511, + "num_input_tokens_seen": 65039590, + "step": 3029, + "time_per_iteration": 2.8283944129943848 + }, + { + "auxiliary_loss_clip": 0.01131194, + "auxiliary_loss_mlp": 0.01086453, + "balance_loss_clip": 1.0366869, + "balance_loss_mlp": 1.00539029, + "epoch": 0.36433595863644563, + "flos": 17639537466240.0, + "grad_norm": 2.264931938673904, + "language_loss": 0.70379853, + "learning_rate": 2.937766030074973e-06, + "loss": 0.72597498, + "num_input_tokens_seen": 65056845, + "step": 3030, + "time_per_iteration": 2.7526519298553467 + }, + { + "auxiliary_loss_clip": 0.01107249, + "auxiliary_loss_mlp": 0.01086493, + "balance_loss_clip": 1.03392887, + "balance_loss_mlp": 1.00552607, + "epoch": 0.3644562015270847, + "flos": 26833279161600.0, + "grad_norm": 1.8084625021650873, + "language_loss": 0.82423496, + "learning_rate": 2.937077924249646e-06, + "loss": 0.84617239, + "num_input_tokens_seen": 65079435, + "step": 3031, + "time_per_iteration": 2.8603341579437256 + }, + { + "auxiliary_loss_clip": 0.01119568, + "auxiliary_loss_mlp": 0.01085381, + "balance_loss_clip": 1.03588939, + "balance_loss_mlp": 1.00446177, + "epoch": 0.3645764444177238, + "flos": 14282715847680.0, + "grad_norm": 2.1780791365484555, + "language_loss": 0.75397277, + "learning_rate": 2.9363896762703443e-06, + "loss": 0.7760222, + "num_input_tokens_seen": 65096500, + "step": 3032, + "time_per_iteration": 2.7507996559143066 + }, + { + "auxiliary_loss_clip": 0.01150641, + "auxiliary_loss_mlp": 0.0108763, + "balance_loss_clip": 1.03623414, + "balance_loss_mlp": 1.00661516, + "epoch": 0.3646966873083629, + "flos": 20667489137280.0, + "grad_norm": 1.6256572219923797, + "language_loss": 0.84273612, + "learning_rate": 2.9357012862414725e-06, + "loss": 0.86511886, + "num_input_tokens_seen": 65115860, + "step": 3033, + "time_per_iteration": 2.7909109592437744 + }, + { + "auxiliary_loss_clip": 0.01141399, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_clip": 1.03553307, + "balance_loss_mlp": 1.00471377, + "epoch": 0.36481693019900197, + "flos": 27782613665280.0, + "grad_norm": 1.8295825105806023, + "language_loss": 0.71847463, + "learning_rate": 2.9350127542674593e-06, + "loss": 0.74074447, + "num_input_tokens_seen": 65138070, + "step": 3034, + "time_per_iteration": 2.7424354553222656 + }, + { + "auxiliary_loss_clip": 0.0111923, + "auxiliary_loss_mlp": 0.01088577, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.00756264, + "epoch": 0.3649371730896411, + "flos": 19712588025600.0, + "grad_norm": 1.9512674625393513, + "language_loss": 0.76761758, + "learning_rate": 2.934324080452755e-06, + "loss": 0.78969562, + "num_input_tokens_seen": 65155860, + "step": 3035, + "time_per_iteration": 2.7717623710632324 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.00873554, + "balance_loss_clip": 1.03460956, + "balance_loss_mlp": 1.00021577, + "epoch": 0.3650574159802802, + "flos": 24750496016640.0, + "grad_norm": 1.4900432079012698, + "language_loss": 0.78283787, + "learning_rate": 2.9336352649018307e-06, + "loss": 0.80280882, + "num_input_tokens_seen": 65175930, + "step": 3036, + "time_per_iteration": 2.859773874282837 + }, + { + "auxiliary_loss_clip": 0.01133955, + "auxiliary_loss_mlp": 0.01087143, + "balance_loss_clip": 1.03687775, + "balance_loss_mlp": 1.00612831, + "epoch": 0.36517765887091924, + "flos": 32853487363200.0, + "grad_norm": 2.0502913386150223, + "language_loss": 0.70182633, + "learning_rate": 2.9329463077191783e-06, + "loss": 0.72403729, + "num_input_tokens_seen": 65199305, + "step": 3037, + "time_per_iteration": 2.876706838607788 + }, + { + "auxiliary_loss_clip": 0.01116015, + "auxiliary_loss_mlp": 0.0108798, + "balance_loss_clip": 1.03505385, + "balance_loss_mlp": 1.00686955, + "epoch": 0.36529790176155835, + "flos": 20120318282880.0, + "grad_norm": 2.723101326578173, + "language_loss": 0.64325875, + "learning_rate": 2.9322572090093135e-06, + "loss": 0.6652987, + "num_input_tokens_seen": 65218010, + "step": 3038, + "time_per_iteration": 3.7620391845703125 + }, + { + "auxiliary_loss_clip": 0.01114574, + "auxiliary_loss_mlp": 0.01086513, + "balance_loss_clip": 1.03324974, + "balance_loss_mlp": 1.00549817, + "epoch": 0.36541814465219746, + "flos": 17639573379840.0, + "grad_norm": 3.0386147935034575, + "language_loss": 0.76492292, + "learning_rate": 2.9315679688767713e-06, + "loss": 0.78693378, + "num_input_tokens_seen": 65236020, + "step": 3039, + "time_per_iteration": 2.7228662967681885 + }, + { + "auxiliary_loss_clip": 0.01134363, + "auxiliary_loss_mlp": 0.0108684, + "balance_loss_clip": 1.03627217, + "balance_loss_mlp": 1.00601625, + "epoch": 0.3655383875428365, + "flos": 22674356887680.0, + "grad_norm": 1.4784695344991705, + "language_loss": 0.66537857, + "learning_rate": 2.9308785874261085e-06, + "loss": 0.68759072, + "num_input_tokens_seen": 65256210, + "step": 3040, + "time_per_iteration": 2.7855005264282227 + }, + { + "auxiliary_loss_clip": 0.01152308, + "auxiliary_loss_mlp": 0.01086851, + "balance_loss_clip": 1.03792906, + "balance_loss_mlp": 1.00583673, + "epoch": 0.36565863043347563, + "flos": 21981173247360.0, + "grad_norm": 2.587730135837609, + "language_loss": 0.81632984, + "learning_rate": 2.9301890647619045e-06, + "loss": 0.83872139, + "num_input_tokens_seen": 65275505, + "step": 3041, + "time_per_iteration": 3.7000865936279297 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_clip": 1.03678846, + "balance_loss_mlp": 1.00669491, + "epoch": 0.36577887332411474, + "flos": 24827632473600.0, + "grad_norm": 2.1610197383438217, + "language_loss": 0.8003608, + "learning_rate": 2.929499400988759e-06, + "loss": 0.82243216, + "num_input_tokens_seen": 65296665, + "step": 3042, + "time_per_iteration": 2.7749838829040527 + }, + { + "auxiliary_loss_clip": 0.01137034, + "auxiliary_loss_mlp": 0.01086782, + "balance_loss_clip": 1.03178155, + "balance_loss_mlp": 1.00567198, + "epoch": 0.3658991162147538, + "flos": 28293191539200.0, + "grad_norm": 1.5775589837961657, + "language_loss": 0.65123224, + "learning_rate": 2.9288095962112927e-06, + "loss": 0.67347038, + "num_input_tokens_seen": 65317370, + "step": 3043, + "time_per_iteration": 2.779498338699341 + }, + { + "auxiliary_loss_clip": 0.01151028, + "auxiliary_loss_mlp": 0.0108624, + "balance_loss_clip": 1.03724885, + "balance_loss_mlp": 1.00522542, + "epoch": 0.3660193591053929, + "flos": 17785550252160.0, + "grad_norm": 2.3781896358483587, + "language_loss": 0.85711223, + "learning_rate": 2.9281196505341503e-06, + "loss": 0.87948489, + "num_input_tokens_seen": 65334540, + "step": 3044, + "time_per_iteration": 2.6351194381713867 + }, + { + "auxiliary_loss_clip": 0.0110227, + "auxiliary_loss_mlp": 0.00873533, + "balance_loss_clip": 1.03066993, + "balance_loss_mlp": 1.00030351, + "epoch": 0.36613960199603196, + "flos": 10342776839040.0, + "grad_norm": 1.9082054590135846, + "language_loss": 0.78206491, + "learning_rate": 2.9274295640619946e-06, + "loss": 0.8018229, + "num_input_tokens_seen": 65351670, + "step": 3045, + "time_per_iteration": 3.766383171081543 + }, + { + "auxiliary_loss_clip": 0.01124478, + "auxiliary_loss_mlp": 0.01087262, + "balance_loss_clip": 1.03556013, + "balance_loss_mlp": 1.00639033, + "epoch": 0.36625984488667107, + "flos": 19755609540480.0, + "grad_norm": 1.8926944285312677, + "language_loss": 0.78208512, + "learning_rate": 2.9267393368995103e-06, + "loss": 0.80420256, + "num_input_tokens_seen": 65370900, + "step": 3046, + "time_per_iteration": 3.740819215774536 + }, + { + "auxiliary_loss_clip": 0.01151378, + "auxiliary_loss_mlp": 0.0108626, + "balance_loss_clip": 1.03701878, + "balance_loss_mlp": 1.00538874, + "epoch": 0.3663800877773102, + "flos": 17674262939520.0, + "grad_norm": 2.8912164879035704, + "language_loss": 0.74280691, + "learning_rate": 2.926048969151407e-06, + "loss": 0.76518327, + "num_input_tokens_seen": 65388185, + "step": 3047, + "time_per_iteration": 2.5901217460632324 + }, + { + "auxiliary_loss_clip": 0.01107024, + "auxiliary_loss_mlp": 0.0108552, + "balance_loss_clip": 1.03161037, + "balance_loss_mlp": 1.00455308, + "epoch": 0.36650033066794924, + "flos": 20303606407680.0, + "grad_norm": 1.8854294822017006, + "language_loss": 0.68483829, + "learning_rate": 2.92535846092241e-06, + "loss": 0.70676368, + "num_input_tokens_seen": 65407200, + "step": 3048, + "time_per_iteration": 2.8090977668762207 + }, + { + "auxiliary_loss_clip": 0.01129978, + "auxiliary_loss_mlp": 0.01087394, + "balance_loss_clip": 1.03323174, + "balance_loss_mlp": 1.0063796, + "epoch": 0.36662057355858835, + "flos": 24716237420160.0, + "grad_norm": 1.783744391477783, + "language_loss": 0.82455492, + "learning_rate": 2.9246678123172704e-06, + "loss": 0.84672862, + "num_input_tokens_seen": 65427290, + "step": 3049, + "time_per_iteration": 2.758577585220337 + }, + { + "auxiliary_loss_clip": 0.01150861, + "auxiliary_loss_mlp": 0.01085568, + "balance_loss_clip": 1.03641295, + "balance_loss_mlp": 1.00455368, + "epoch": 0.36674081644922746, + "flos": 12385267902720.0, + "grad_norm": 2.7631576685697268, + "language_loss": 0.74082589, + "learning_rate": 2.9239770234407596e-06, + "loss": 0.76319021, + "num_input_tokens_seen": 65445595, + "step": 3050, + "time_per_iteration": 2.6562747955322266 + }, + { + "auxiliary_loss_clip": 0.01141954, + "auxiliary_loss_mlp": 0.01085272, + "balance_loss_clip": 1.03639865, + "balance_loss_mlp": 1.00435305, + "epoch": 0.3668610593398665, + "flos": 21105922544640.0, + "grad_norm": 1.6270303987147734, + "language_loss": 0.68055451, + "learning_rate": 2.9232860943976686e-06, + "loss": 0.70282674, + "num_input_tokens_seen": 65466330, + "step": 3051, + "time_per_iteration": 2.734797716140747 + }, + { + "auxiliary_loss_clip": 0.01129697, + "auxiliary_loss_mlp": 0.01085272, + "balance_loss_clip": 1.03436017, + "balance_loss_mlp": 1.0044955, + "epoch": 0.3669813022305056, + "flos": 26758082039040.0, + "grad_norm": 2.2697084662268994, + "language_loss": 0.84099084, + "learning_rate": 2.9225950252928115e-06, + "loss": 0.86314046, + "num_input_tokens_seen": 65487180, + "step": 3052, + "time_per_iteration": 2.9162089824676514 + }, + { + "auxiliary_loss_clip": 0.01137341, + "auxiliary_loss_mlp": 0.01087509, + "balance_loss_clip": 1.03243816, + "balance_loss_mlp": 1.00649464, + "epoch": 0.36710154512114473, + "flos": 19099521671040.0, + "grad_norm": 2.589833256103505, + "language_loss": 0.82104343, + "learning_rate": 2.9219038162310217e-06, + "loss": 0.84329194, + "num_input_tokens_seen": 65505380, + "step": 3053, + "time_per_iteration": 2.783470869064331 + }, + { + "auxiliary_loss_clip": 0.01068568, + "auxiliary_loss_mlp": 0.00873644, + "balance_loss_clip": 1.02457345, + "balance_loss_mlp": 1.00033116, + "epoch": 0.3672217880117838, + "flos": 20812029465600.0, + "grad_norm": 2.2918022517410206, + "language_loss": 0.82304859, + "learning_rate": 2.921212467317157e-06, + "loss": 0.84247077, + "num_input_tokens_seen": 65524825, + "step": 3054, + "time_per_iteration": 2.9463038444519043 + }, + { + "auxiliary_loss_clip": 0.01132768, + "auxiliary_loss_mlp": 0.01085826, + "balance_loss_clip": 1.03479612, + "balance_loss_mlp": 1.00476384, + "epoch": 0.3673420309024229, + "flos": 13590394133760.0, + "grad_norm": 1.8987724290821892, + "language_loss": 0.79624367, + "learning_rate": 2.920520978656093e-06, + "loss": 0.81842959, + "num_input_tokens_seen": 65541790, + "step": 3055, + "time_per_iteration": 2.7908828258514404 + }, + { + "auxiliary_loss_clip": 0.01150762, + "auxiliary_loss_mlp": 0.00873449, + "balance_loss_clip": 1.03712177, + "balance_loss_mlp": 1.00033379, + "epoch": 0.367462273793062, + "flos": 28986877969920.0, + "grad_norm": 2.0962808905372734, + "language_loss": 0.76735795, + "learning_rate": 2.919829350352729e-06, + "loss": 0.78760004, + "num_input_tokens_seen": 65563395, + "step": 3056, + "time_per_iteration": 2.7757527828216553 + }, + { + "auxiliary_loss_clip": 0.01150822, + "auxiliary_loss_mlp": 0.01079612, + "balance_loss_clip": 1.05151224, + "balance_loss_mlp": 1.00026643, + "epoch": 0.36758251668370107, + "flos": 62643148346880.0, + "grad_norm": 0.7564670106042019, + "language_loss": 0.60021538, + "learning_rate": 2.919137582511983e-06, + "loss": 0.62251973, + "num_input_tokens_seen": 65619835, + "step": 3057, + "time_per_iteration": 3.2206082344055176 + }, + { + "auxiliary_loss_clip": 0.01116812, + "auxiliary_loss_mlp": 0.01086312, + "balance_loss_clip": 1.03017867, + "balance_loss_mlp": 1.00548768, + "epoch": 0.3677027595743402, + "flos": 12713886455040.0, + "grad_norm": 2.0865745216267064, + "language_loss": 0.63685262, + "learning_rate": 2.918445675238797e-06, + "loss": 0.65888393, + "num_input_tokens_seen": 65636760, + "step": 3058, + "time_per_iteration": 2.836479663848877 + }, + { + "auxiliary_loss_clip": 0.01149691, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_clip": 1.03536487, + "balance_loss_mlp": 1.0049603, + "epoch": 0.36782300246497923, + "flos": 25046579825280.0, + "grad_norm": 2.037145014600742, + "language_loss": 0.69826531, + "learning_rate": 2.917753628638132e-06, + "loss": 0.72062206, + "num_input_tokens_seen": 65657065, + "step": 3059, + "time_per_iteration": 2.7494890689849854 + }, + { + "auxiliary_loss_clip": 0.01131565, + "auxiliary_loss_mlp": 0.01085829, + "balance_loss_clip": 1.0352788, + "balance_loss_mlp": 1.00486255, + "epoch": 0.36794324535561834, + "flos": 17419512706560.0, + "grad_norm": 2.006339117917995, + "language_loss": 0.70070529, + "learning_rate": 2.9170614428149716e-06, + "loss": 0.72287929, + "num_input_tokens_seen": 65675400, + "step": 3060, + "time_per_iteration": 2.849663496017456 + }, + { + "auxiliary_loss_clip": 0.01123485, + "auxiliary_loss_mlp": 0.01086599, + "balance_loss_clip": 1.03620565, + "balance_loss_mlp": 1.00548911, + "epoch": 0.36806348824625745, + "flos": 24089128848000.0, + "grad_norm": 2.355075357627474, + "language_loss": 0.86756897, + "learning_rate": 2.9163691178743195e-06, + "loss": 0.88966978, + "num_input_tokens_seen": 65694050, + "step": 3061, + "time_per_iteration": 2.8376665115356445 + }, + { + "auxiliary_loss_clip": 0.01140373, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_clip": 1.03537464, + "balance_loss_mlp": 1.00536931, + "epoch": 0.3681837311368965, + "flos": 20521871400960.0, + "grad_norm": 1.927990318367736, + "language_loss": 0.77367496, + "learning_rate": 2.9156766539212006e-06, + "loss": 0.79594201, + "num_input_tokens_seen": 65711695, + "step": 3062, + "time_per_iteration": 2.781230926513672 + }, + { + "auxiliary_loss_clip": 0.01141335, + "auxiliary_loss_mlp": 0.01085202, + "balance_loss_clip": 1.03511262, + "balance_loss_mlp": 1.00428295, + "epoch": 0.3683039740275356, + "flos": 21466644877440.0, + "grad_norm": 2.1083096840966244, + "language_loss": 0.71651924, + "learning_rate": 2.9149840510606614e-06, + "loss": 0.73878467, + "num_input_tokens_seen": 65730350, + "step": 3063, + "time_per_iteration": 2.7361772060394287 + }, + { + "auxiliary_loss_clip": 0.01142281, + "auxiliary_loss_mlp": 0.00873355, + "balance_loss_clip": 1.05142188, + "balance_loss_mlp": 1.00124288, + "epoch": 0.36842421691817473, + "flos": 70380999987840.0, + "grad_norm": 1.0340933440904594, + "language_loss": 0.64231312, + "learning_rate": 2.914291309397769e-06, + "loss": 0.66246951, + "num_input_tokens_seen": 65787820, + "step": 3064, + "time_per_iteration": 4.348371505737305 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.01085393, + "balance_loss_clip": 1.02906394, + "balance_loss_mlp": 1.00437808, + "epoch": 0.3685444598088138, + "flos": 23331378510720.0, + "grad_norm": 2.1277890714138827, + "language_loss": 0.78214574, + "learning_rate": 2.9135984290376117e-06, + "loss": 0.80398804, + "num_input_tokens_seen": 65806685, + "step": 3065, + "time_per_iteration": 2.941007137298584 + }, + { + "auxiliary_loss_clip": 0.01104807, + "auxiliary_loss_mlp": 0.01085356, + "balance_loss_clip": 1.0333153, + "balance_loss_mlp": 1.00448453, + "epoch": 0.3686647026994529, + "flos": 23070271570560.0, + "grad_norm": 1.8016873525873949, + "language_loss": 0.82582188, + "learning_rate": 2.9129054100853e-06, + "loss": 0.84772348, + "num_input_tokens_seen": 65825525, + "step": 3066, + "time_per_iteration": 2.8682913780212402 + }, + { + "auxiliary_loss_clip": 0.01130051, + "auxiliary_loss_mlp": 0.01087375, + "balance_loss_clip": 1.03376627, + "balance_loss_mlp": 1.00626528, + "epoch": 0.368784945590092, + "flos": 25119909440640.0, + "grad_norm": 1.7950553255391524, + "language_loss": 0.76065326, + "learning_rate": 2.912212252645963e-06, + "loss": 0.78282762, + "num_input_tokens_seen": 65848110, + "step": 3067, + "time_per_iteration": 3.72965669631958 + }, + { + "auxiliary_loss_clip": 0.0114135, + "auxiliary_loss_mlp": 0.01084744, + "balance_loss_clip": 1.03514981, + "balance_loss_mlp": 1.00363445, + "epoch": 0.36890518848073106, + "flos": 18442284566400.0, + "grad_norm": 2.0098269391640873, + "language_loss": 0.76339096, + "learning_rate": 2.9115189568247523e-06, + "loss": 0.78565192, + "num_input_tokens_seen": 65865670, + "step": 3068, + "time_per_iteration": 2.6715567111968994 + }, + { + "auxiliary_loss_clip": 0.01095304, + "auxiliary_loss_mlp": 0.01086628, + "balance_loss_clip": 1.02631164, + "balance_loss_mlp": 1.00570917, + "epoch": 0.36902543137137017, + "flos": 16362446336640.0, + "grad_norm": 2.9644899179913438, + "language_loss": 0.91851586, + "learning_rate": 2.910825522726841e-06, + "loss": 0.94033515, + "num_input_tokens_seen": 65883195, + "step": 3069, + "time_per_iteration": 2.789181709289551 + }, + { + "auxiliary_loss_clip": 0.01113791, + "auxiliary_loss_mlp": 0.01085766, + "balance_loss_clip": 1.03379035, + "balance_loss_mlp": 1.00479901, + "epoch": 0.3691456742620093, + "flos": 12275596702080.0, + "grad_norm": 2.059629424829178, + "language_loss": 0.76811099, + "learning_rate": 2.9101319504574215e-06, + "loss": 0.79010665, + "num_input_tokens_seen": 65899635, + "step": 3070, + "time_per_iteration": 3.7397024631500244 + }, + { + "auxiliary_loss_clip": 0.01140994, + "auxiliary_loss_mlp": 0.01085679, + "balance_loss_clip": 1.03480649, + "balance_loss_mlp": 1.00456882, + "epoch": 0.36926591715264834, + "flos": 17786412178560.0, + "grad_norm": 1.6469981353352048, + "language_loss": 0.76049733, + "learning_rate": 2.909438240121709e-06, + "loss": 0.78276408, + "num_input_tokens_seen": 65919910, + "step": 3071, + "time_per_iteration": 2.730668783187866 + }, + { + "auxiliary_loss_clip": 0.01123083, + "auxiliary_loss_mlp": 0.01085775, + "balance_loss_clip": 1.03405952, + "balance_loss_mlp": 1.004951, + "epoch": 0.36938616004328745, + "flos": 28948309741440.0, + "grad_norm": 1.8775016989871915, + "language_loss": 0.70541823, + "learning_rate": 2.908744391824939e-06, + "loss": 0.72750676, + "num_input_tokens_seen": 65940930, + "step": 3072, + "time_per_iteration": 3.751483201980591 + }, + { + "auxiliary_loss_clip": 0.01097818, + "auxiliary_loss_mlp": 0.0108526, + "balance_loss_clip": 1.02786839, + "balance_loss_mlp": 1.00424552, + "epoch": 0.36950640293392656, + "flos": 29205394358400.0, + "grad_norm": 1.8379615098831592, + "language_loss": 0.79363179, + "learning_rate": 2.908050405672367e-06, + "loss": 0.81546259, + "num_input_tokens_seen": 65960475, + "step": 3073, + "time_per_iteration": 2.929776191711426 + }, + { + "auxiliary_loss_clip": 0.01133745, + "auxiliary_loss_mlp": 0.01086243, + "balance_loss_clip": 1.03506482, + "balance_loss_mlp": 1.00532389, + "epoch": 0.3696266458245656, + "flos": 24827776128000.0, + "grad_norm": 1.6869963289456533, + "language_loss": 0.79167926, + "learning_rate": 2.9073562817692703e-06, + "loss": 0.81387913, + "num_input_tokens_seen": 65979160, + "step": 3074, + "time_per_iteration": 2.7979116439819336 + }, + { + "auxiliary_loss_clip": 0.01118171, + "auxiliary_loss_mlp": 0.01079936, + "balance_loss_clip": 1.05101132, + "balance_loss_mlp": 1.00059021, + "epoch": 0.3697468887152047, + "flos": 59887257264000.0, + "grad_norm": 0.7209268906164661, + "language_loss": 0.5652684, + "learning_rate": 2.9066620202209468e-06, + "loss": 0.5872494, + "num_input_tokens_seen": 66041650, + "step": 3075, + "time_per_iteration": 3.3021838665008545 + }, + { + "auxiliary_loss_clip": 0.01123389, + "auxiliary_loss_mlp": 0.01085001, + "balance_loss_clip": 1.03562248, + "balance_loss_mlp": 1.00417686, + "epoch": 0.3698671316058438, + "flos": 26137581569280.0, + "grad_norm": 1.833780538466884, + "language_loss": 0.77817166, + "learning_rate": 2.905967621132716e-06, + "loss": 0.80025554, + "num_input_tokens_seen": 66059260, + "step": 3076, + "time_per_iteration": 2.8242509365081787 + }, + { + "auxiliary_loss_clip": 0.01131653, + "auxiliary_loss_mlp": 0.01085044, + "balance_loss_clip": 1.03488576, + "balance_loss_mlp": 1.00388622, + "epoch": 0.3699873744964829, + "flos": 24607464059520.0, + "grad_norm": 2.6498036549117225, + "language_loss": 0.74919081, + "learning_rate": 2.9052730846099172e-06, + "loss": 0.77135783, + "num_input_tokens_seen": 66080605, + "step": 3077, + "time_per_iteration": 2.740640640258789 + }, + { + "auxiliary_loss_clip": 0.01133152, + "auxiliary_loss_mlp": 0.01079577, + "balance_loss_clip": 1.05041802, + "balance_loss_mlp": 1.00023162, + "epoch": 0.370107617387122, + "flos": 64885340050560.0, + "grad_norm": 0.8614826152763114, + "language_loss": 0.6095292, + "learning_rate": 2.9045784107579123e-06, + "loss": 0.63165653, + "num_input_tokens_seen": 66140710, + "step": 3078, + "time_per_iteration": 3.299844980239868 + }, + { + "auxiliary_loss_clip": 0.01151731, + "auxiliary_loss_mlp": 0.01085639, + "balance_loss_clip": 1.03764307, + "balance_loss_mlp": 1.0047673, + "epoch": 0.37022786027776106, + "flos": 15961683317760.0, + "grad_norm": 1.8196442051812929, + "language_loss": 0.67106527, + "learning_rate": 2.9038835996820807e-06, + "loss": 0.69343895, + "num_input_tokens_seen": 66158320, + "step": 3079, + "time_per_iteration": 2.6493239402770996 + }, + { + "auxiliary_loss_clip": 0.01126056, + "auxiliary_loss_mlp": 0.01088197, + "balance_loss_clip": 1.03583837, + "balance_loss_mlp": 1.00732589, + "epoch": 0.37034810316840017, + "flos": 18546927863040.0, + "grad_norm": 1.9695875524041486, + "language_loss": 0.79478002, + "learning_rate": 2.903188651487826e-06, + "loss": 0.81692261, + "num_input_tokens_seen": 66176875, + "step": 3080, + "time_per_iteration": 2.741687059402466 + }, + { + "auxiliary_loss_clip": 0.01127857, + "auxiliary_loss_mlp": 0.01085107, + "balance_loss_clip": 1.03696918, + "balance_loss_mlp": 1.00423563, + "epoch": 0.3704683460590393, + "flos": 17821927751040.0, + "grad_norm": 2.3624837083964723, + "language_loss": 0.86564589, + "learning_rate": 2.902493566280571e-06, + "loss": 0.88777554, + "num_input_tokens_seen": 66194980, + "step": 3081, + "time_per_iteration": 2.647855281829834 + }, + { + "auxiliary_loss_clip": 0.01122209, + "auxiliary_loss_mlp": 0.01086236, + "balance_loss_clip": 1.03197682, + "balance_loss_mlp": 1.0053165, + "epoch": 0.37058858894967833, + "flos": 14134081368960.0, + "grad_norm": 2.156754530815798, + "language_loss": 0.81084037, + "learning_rate": 2.9017983441657595e-06, + "loss": 0.83292484, + "num_input_tokens_seen": 66212310, + "step": 3082, + "time_per_iteration": 2.796354055404663 + }, + { + "auxiliary_loss_clip": 0.01111497, + "auxiliary_loss_mlp": 0.0108776, + "balance_loss_clip": 1.03171563, + "balance_loss_mlp": 1.00684094, + "epoch": 0.37070883184031744, + "flos": 13954492344960.0, + "grad_norm": 1.9253749845902433, + "language_loss": 0.75182247, + "learning_rate": 2.9011029852488564e-06, + "loss": 0.77381498, + "num_input_tokens_seen": 66229545, + "step": 3083, + "time_per_iteration": 2.756098508834839 + }, + { + "auxiliary_loss_clip": 0.01149791, + "auxiliary_loss_mlp": 0.01079556, + "balance_loss_clip": 1.05060887, + "balance_loss_mlp": 1.00020981, + "epoch": 0.37082907473095655, + "flos": 52315419306240.0, + "grad_norm": 1.011409756189986, + "language_loss": 0.62461281, + "learning_rate": 2.9004074896353465e-06, + "loss": 0.64690632, + "num_input_tokens_seen": 66283545, + "step": 3084, + "time_per_iteration": 3.218221426010132 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01087248, + "balance_loss_clip": 1.03978419, + "balance_loss_mlp": 1.00642431, + "epoch": 0.3709493176215956, + "flos": 15998096730240.0, + "grad_norm": 1.7678328099310516, + "language_loss": 0.81438506, + "learning_rate": 2.8997118574307362e-06, + "loss": 0.83679593, + "num_input_tokens_seen": 66300500, + "step": 3085, + "time_per_iteration": 2.7441341876983643 + }, + { + "auxiliary_loss_clip": 0.01109261, + "auxiliary_loss_mlp": 0.01086758, + "balance_loss_clip": 1.03549099, + "balance_loss_mlp": 1.00588679, + "epoch": 0.3710695605122347, + "flos": 20959837931520.0, + "grad_norm": 1.7509676797321456, + "language_loss": 0.73842597, + "learning_rate": 2.899016088740553e-06, + "loss": 0.76038617, + "num_input_tokens_seen": 66318610, + "step": 3086, + "time_per_iteration": 2.7596704959869385 + }, + { + "auxiliary_loss_clip": 0.01108329, + "auxiliary_loss_mlp": 0.01085979, + "balance_loss_clip": 1.02993488, + "balance_loss_mlp": 1.00515556, + "epoch": 0.37118980340287383, + "flos": 14355578586240.0, + "grad_norm": 1.8619419421561918, + "language_loss": 0.78918219, + "learning_rate": 2.898320183670344e-06, + "loss": 0.81112528, + "num_input_tokens_seen": 66336025, + "step": 3087, + "time_per_iteration": 2.8117282390594482 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01085902, + "balance_loss_clip": 1.02669692, + "balance_loss_mlp": 1.00503039, + "epoch": 0.3713100462935129, + "flos": 25885381201920.0, + "grad_norm": 2.0590935887490676, + "language_loss": 0.8862586, + "learning_rate": 2.8976241423256767e-06, + "loss": 0.90814602, + "num_input_tokens_seen": 66356120, + "step": 3088, + "time_per_iteration": 2.817880868911743 + }, + { + "auxiliary_loss_clip": 0.01135838, + "auxiliary_loss_mlp": 0.01087058, + "balance_loss_clip": 1.03792167, + "balance_loss_mlp": 1.0063293, + "epoch": 0.371430289184152, + "flos": 30518934814080.0, + "grad_norm": 2.418136242962453, + "language_loss": 0.68140101, + "learning_rate": 2.896927964812142e-06, + "loss": 0.70362997, + "num_input_tokens_seen": 66376685, + "step": 3089, + "time_per_iteration": 3.717272996902466 + }, + { + "auxiliary_loss_clip": 0.01125698, + "auxiliary_loss_mlp": 0.01085252, + "balance_loss_clip": 1.03103817, + "balance_loss_mlp": 1.00442767, + "epoch": 0.37155053207479105, + "flos": 15742233175680.0, + "grad_norm": 2.87830844768005, + "language_loss": 0.74739259, + "learning_rate": 2.8962316512353465e-06, + "loss": 0.76950204, + "num_input_tokens_seen": 66394230, + "step": 3090, + "time_per_iteration": 2.7880141735076904 + }, + { + "auxiliary_loss_clip": 0.01105832, + "auxiliary_loss_mlp": 0.01085752, + "balance_loss_clip": 1.03390098, + "balance_loss_mlp": 1.00488055, + "epoch": 0.37167077496543016, + "flos": 23404061681280.0, + "grad_norm": 1.5885746563689305, + "language_loss": 0.75148237, + "learning_rate": 2.8955352017009233e-06, + "loss": 0.77339828, + "num_input_tokens_seen": 66413475, + "step": 3091, + "time_per_iteration": 2.896967649459839 + }, + { + "auxiliary_loss_clip": 0.01128986, + "auxiliary_loss_mlp": 0.01086739, + "balance_loss_clip": 1.03337634, + "balance_loss_mlp": 1.00572455, + "epoch": 0.3717910178560693, + "flos": 22088653718400.0, + "grad_norm": 1.8724910719828536, + "language_loss": 0.7715162, + "learning_rate": 2.8948386163145212e-06, + "loss": 0.79367352, + "num_input_tokens_seen": 66432685, + "step": 3092, + "time_per_iteration": 3.6760058403015137 + }, + { + "auxiliary_loss_clip": 0.01128842, + "auxiliary_loss_mlp": 0.01084983, + "balance_loss_clip": 1.03765666, + "balance_loss_mlp": 1.0042069, + "epoch": 0.3719112607467083, + "flos": 26939969533440.0, + "grad_norm": 2.061663517453791, + "language_loss": 0.79597515, + "learning_rate": 2.8941418951818135e-06, + "loss": 0.81811345, + "num_input_tokens_seen": 66452245, + "step": 3093, + "time_per_iteration": 2.716201066970825 + }, + { + "auxiliary_loss_clip": 0.01119602, + "auxiliary_loss_mlp": 0.01087347, + "balance_loss_clip": 1.03196108, + "balance_loss_mlp": 1.00657105, + "epoch": 0.37203150363734744, + "flos": 12166500119040.0, + "grad_norm": 2.3886355351224178, + "language_loss": 0.71564764, + "learning_rate": 2.8934450384084903e-06, + "loss": 0.73771721, + "num_input_tokens_seen": 66469760, + "step": 3094, + "time_per_iteration": 2.7238643169403076 + }, + { + "auxiliary_loss_clip": 0.01129592, + "auxiliary_loss_mlp": 0.01086376, + "balance_loss_clip": 1.03223968, + "balance_loss_mlp": 1.00521851, + "epoch": 0.37215174652798655, + "flos": 23697595624320.0, + "grad_norm": 1.8635202697624254, + "language_loss": 0.69551122, + "learning_rate": 2.8927480461002653e-06, + "loss": 0.71767092, + "num_input_tokens_seen": 66489730, + "step": 3095, + "time_per_iteration": 2.8121092319488525 + }, + { + "auxiliary_loss_clip": 0.01134206, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_clip": 1.03574181, + "balance_loss_mlp": 1.00607109, + "epoch": 0.3722719894186256, + "flos": 17887751424000.0, + "grad_norm": 2.2740541090140636, + "language_loss": 0.85534316, + "learning_rate": 2.892050918362872e-06, + "loss": 0.87755704, + "num_input_tokens_seen": 66504785, + "step": 3096, + "time_per_iteration": 3.631005048751831 + }, + { + "auxiliary_loss_clip": 0.01091336, + "auxiliary_loss_mlp": 0.01079921, + "balance_loss_clip": 1.04235542, + "balance_loss_mlp": 1.00057507, + "epoch": 0.3723922323092647, + "flos": 62419891363200.0, + "grad_norm": 0.8491665139786293, + "language_loss": 0.55932337, + "learning_rate": 2.8913536553020626e-06, + "loss": 0.58103597, + "num_input_tokens_seen": 66558840, + "step": 3097, + "time_per_iteration": 4.293347597122192 + }, + { + "auxiliary_loss_clip": 0.01114241, + "auxiliary_loss_mlp": 0.01085494, + "balance_loss_clip": 1.03341806, + "balance_loss_mlp": 1.00481272, + "epoch": 0.3725124751999038, + "flos": 23039747988480.0, + "grad_norm": 2.8305647537022978, + "language_loss": 0.84486401, + "learning_rate": 2.8906562570236137e-06, + "loss": 0.86686134, + "num_input_tokens_seen": 66576750, + "step": 3098, + "time_per_iteration": 2.8991312980651855 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01086643, + "balance_loss_clip": 1.03284049, + "balance_loss_mlp": 1.00581932, + "epoch": 0.3726327180905429, + "flos": 20920551431040.0, + "grad_norm": 1.5286634137659934, + "language_loss": 0.76431572, + "learning_rate": 2.889958723633318e-06, + "loss": 0.78623104, + "num_input_tokens_seen": 66595690, + "step": 3099, + "time_per_iteration": 2.918506622314453 + }, + { + "auxiliary_loss_clip": 0.01121167, + "auxiliary_loss_mlp": 0.01085176, + "balance_loss_clip": 1.03306198, + "balance_loss_mlp": 1.00420916, + "epoch": 0.372752960981182, + "flos": 30592156688640.0, + "grad_norm": 1.5414022984373976, + "language_loss": 0.73563373, + "learning_rate": 2.889261055236992e-06, + "loss": 0.75769711, + "num_input_tokens_seen": 66617905, + "step": 3100, + "time_per_iteration": 2.8411941528320312 + }, + { + "auxiliary_loss_clip": 0.01128048, + "auxiliary_loss_mlp": 0.01086417, + "balance_loss_clip": 1.03367341, + "balance_loss_mlp": 1.00549817, + "epoch": 0.3728732038718211, + "flos": 25116749043840.0, + "grad_norm": 1.794973425331519, + "language_loss": 0.82515919, + "learning_rate": 2.8885632519404704e-06, + "loss": 0.84730387, + "num_input_tokens_seen": 66638175, + "step": 3101, + "time_per_iteration": 2.8041694164276123 + }, + { + "auxiliary_loss_clip": 0.011269, + "auxiliary_loss_mlp": 0.01085277, + "balance_loss_clip": 1.03120518, + "balance_loss_mlp": 1.00426221, + "epoch": 0.37299344676246016, + "flos": 25302048330240.0, + "grad_norm": 2.0883115714627247, + "language_loss": 0.75911278, + "learning_rate": 2.8878653138496107e-06, + "loss": 0.78123456, + "num_input_tokens_seen": 66658670, + "step": 3102, + "time_per_iteration": 2.7798492908477783 + }, + { + "auxiliary_loss_clip": 0.01090702, + "auxiliary_loss_mlp": 0.01086351, + "balance_loss_clip": 1.03327167, + "balance_loss_mlp": 1.00538397, + "epoch": 0.37311368965309927, + "flos": 23842531002240.0, + "grad_norm": 2.7481359759825885, + "language_loss": 0.76303351, + "learning_rate": 2.8871672410702878e-06, + "loss": 0.78480399, + "num_input_tokens_seen": 66676030, + "step": 3103, + "time_per_iteration": 2.921762704849243 + }, + { + "auxiliary_loss_clip": 0.01122594, + "auxiliary_loss_mlp": 0.01087772, + "balance_loss_clip": 1.03376317, + "balance_loss_mlp": 1.00661469, + "epoch": 0.3732339325437384, + "flos": 25811943845760.0, + "grad_norm": 1.6567784145815945, + "language_loss": 0.81702119, + "learning_rate": 2.8864690337084008e-06, + "loss": 0.83912492, + "num_input_tokens_seen": 66695305, + "step": 3104, + "time_per_iteration": 2.795027732849121 + }, + { + "auxiliary_loss_clip": 0.01143261, + "auxiliary_loss_mlp": 0.01085239, + "balance_loss_clip": 1.03654861, + "balance_loss_mlp": 1.00436735, + "epoch": 0.37335417543437743, + "flos": 26208433146240.0, + "grad_norm": 1.671286022969959, + "language_loss": 0.77783895, + "learning_rate": 2.885770691869866e-06, + "loss": 0.80012393, + "num_input_tokens_seen": 66716185, + "step": 3105, + "time_per_iteration": 2.8189709186553955 + }, + { + "auxiliary_loss_clip": 0.01140997, + "auxiliary_loss_mlp": 0.01084805, + "balance_loss_clip": 1.03641653, + "balance_loss_mlp": 1.00402939, + "epoch": 0.37347441832501654, + "flos": 24023879792640.0, + "grad_norm": 2.5178995420065076, + "language_loss": 0.7467885, + "learning_rate": 2.8850722156606207e-06, + "loss": 0.76904649, + "num_input_tokens_seen": 66734575, + "step": 3106, + "time_per_iteration": 2.74288010597229 + }, + { + "auxiliary_loss_clip": 0.01142915, + "auxiliary_loss_mlp": 0.01086921, + "balance_loss_clip": 1.03720105, + "balance_loss_mlp": 1.00590611, + "epoch": 0.3735946612156556, + "flos": 19714922409600.0, + "grad_norm": 2.017041438466206, + "language_loss": 0.67222619, + "learning_rate": 2.8843736051866252e-06, + "loss": 0.69452447, + "num_input_tokens_seen": 66753500, + "step": 3107, + "time_per_iteration": 2.6935081481933594 + }, + { + "auxiliary_loss_clip": 0.0111089, + "auxiliary_loss_mlp": 0.00873373, + "balance_loss_clip": 1.03271985, + "balance_loss_mlp": 1.00034225, + "epoch": 0.3737149041062947, + "flos": 23039604334080.0, + "grad_norm": 1.5462133996510325, + "language_loss": 0.69375706, + "learning_rate": 2.8836748605538557e-06, + "loss": 0.71359968, + "num_input_tokens_seen": 66775140, + "step": 3108, + "time_per_iteration": 2.864231824874878 + }, + { + "auxiliary_loss_clip": 0.01131393, + "auxiliary_loss_mlp": 0.01085879, + "balance_loss_clip": 1.03418529, + "balance_loss_mlp": 1.00486457, + "epoch": 0.3738351469969338, + "flos": 34678108483200.0, + "grad_norm": 2.404143821959309, + "language_loss": 0.63221467, + "learning_rate": 2.882975981868313e-06, + "loss": 0.65438741, + "num_input_tokens_seen": 66795525, + "step": 3109, + "time_per_iteration": 2.918832778930664 + }, + { + "auxiliary_loss_clip": 0.01133912, + "auxiliary_loss_mlp": 0.01087511, + "balance_loss_clip": 1.03473401, + "balance_loss_mlp": 1.00640106, + "epoch": 0.3739553898875729, + "flos": 43507967448960.0, + "grad_norm": 2.6054191893101883, + "language_loss": 0.68307006, + "learning_rate": 2.882276969236016e-06, + "loss": 0.7052843, + "num_input_tokens_seen": 66816885, + "step": 3110, + "time_per_iteration": 2.935309410095215 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01088185, + "balance_loss_clip": 1.03369951, + "balance_loss_mlp": 1.00726604, + "epoch": 0.374075632778212, + "flos": 12856487448960.0, + "grad_norm": 1.8696729799636083, + "language_loss": 0.76238626, + "learning_rate": 2.881577822763005e-06, + "loss": 0.7845645, + "num_input_tokens_seen": 66834835, + "step": 3111, + "time_per_iteration": 2.768016815185547 + }, + { + "auxiliary_loss_clip": 0.01140134, + "auxiliary_loss_mlp": 0.01085919, + "balance_loss_clip": 1.03487659, + "balance_loss_mlp": 1.00499952, + "epoch": 0.3741958756688511, + "flos": 26024031699840.0, + "grad_norm": 2.544795084329531, + "language_loss": 0.87801975, + "learning_rate": 2.880878542555338e-06, + "loss": 0.9002803, + "num_input_tokens_seen": 66852600, + "step": 3112, + "time_per_iteration": 2.776273250579834 + }, + { + "auxiliary_loss_clip": 0.01149387, + "auxiliary_loss_mlp": 0.01087242, + "balance_loss_clip": 1.0357554, + "balance_loss_mlp": 1.00617933, + "epoch": 0.37431611855949015, + "flos": 21433894652160.0, + "grad_norm": 2.1399126385034206, + "language_loss": 0.80015516, + "learning_rate": 2.8801791287190976e-06, + "loss": 0.82252145, + "num_input_tokens_seen": 66870595, + "step": 3113, + "time_per_iteration": 2.679262399673462 + }, + { + "auxiliary_loss_clip": 0.01140097, + "auxiliary_loss_mlp": 0.01085436, + "balance_loss_clip": 1.03511024, + "balance_loss_mlp": 1.00437331, + "epoch": 0.37443636145012926, + "flos": 24207096090240.0, + "grad_norm": 2.5909795918294516, + "language_loss": 0.86123371, + "learning_rate": 2.8794795813603817e-06, + "loss": 0.88348901, + "num_input_tokens_seen": 66886060, + "step": 3114, + "time_per_iteration": 3.550638198852539 + }, + { + "auxiliary_loss_clip": 0.01127054, + "auxiliary_loss_mlp": 0.01087757, + "balance_loss_clip": 1.03590715, + "balance_loss_mlp": 1.00674212, + "epoch": 0.3745566043407684, + "flos": 15378601841280.0, + "grad_norm": 1.935673893174875, + "language_loss": 0.81514525, + "learning_rate": 2.878779900585314e-06, + "loss": 0.83729339, + "num_input_tokens_seen": 66903900, + "step": 3115, + "time_per_iteration": 2.658168077468872 + }, + { + "auxiliary_loss_clip": 0.01133667, + "auxiliary_loss_mlp": 0.01085641, + "balance_loss_clip": 1.03577268, + "balance_loss_mlp": 1.00486493, + "epoch": 0.37467684723140743, + "flos": 24608218245120.0, + "grad_norm": 1.4825970350490836, + "language_loss": 0.75318944, + "learning_rate": 2.8780800865000336e-06, + "loss": 0.77538252, + "num_input_tokens_seen": 66925210, + "step": 3116, + "time_per_iteration": 2.820620536804199 + }, + { + "auxiliary_loss_clip": 0.01134223, + "auxiliary_loss_mlp": 0.01079585, + "balance_loss_clip": 1.0436666, + "balance_loss_mlp": 1.00023973, + "epoch": 0.37479709012204654, + "flos": 64377491610240.0, + "grad_norm": 0.9940893726988331, + "language_loss": 0.59177232, + "learning_rate": 2.877380139210702e-06, + "loss": 0.61391044, + "num_input_tokens_seen": 66983880, + "step": 3117, + "time_per_iteration": 4.164197206497192 + }, + { + "auxiliary_loss_clip": 0.01099822, + "auxiliary_loss_mlp": 0.01087575, + "balance_loss_clip": 1.03241146, + "balance_loss_mlp": 1.0065124, + "epoch": 0.37491733301268565, + "flos": 23803962773760.0, + "grad_norm": 2.4051108580245817, + "language_loss": 0.76210082, + "learning_rate": 2.876680058823501e-06, + "loss": 0.78397477, + "num_input_tokens_seen": 67004280, + "step": 3118, + "time_per_iteration": 2.7753472328186035 + }, + { + "auxiliary_loss_clip": 0.0112733, + "auxiliary_loss_mlp": 0.01086223, + "balance_loss_clip": 1.03196597, + "balance_loss_mlp": 1.00525606, + "epoch": 0.3750375759033247, + "flos": 32160950167680.0, + "grad_norm": 1.9297348302542299, + "language_loss": 0.66238874, + "learning_rate": 2.8759798454446314e-06, + "loss": 0.6845243, + "num_input_tokens_seen": 67027445, + "step": 3119, + "time_per_iteration": 2.8069915771484375 + }, + { + "auxiliary_loss_clip": 0.01141232, + "auxiliary_loss_mlp": 0.01085689, + "balance_loss_clip": 1.03531051, + "balance_loss_mlp": 1.00472176, + "epoch": 0.3751578187939638, + "flos": 23367791923200.0, + "grad_norm": 1.9179367691484475, + "language_loss": 0.81417698, + "learning_rate": 2.8752794991803173e-06, + "loss": 0.83644617, + "num_input_tokens_seen": 67045130, + "step": 3120, + "time_per_iteration": 2.7347805500030518 + }, + { + "auxiliary_loss_clip": 0.01132017, + "auxiliary_loss_mlp": 0.01087094, + "balance_loss_clip": 1.03616881, + "balance_loss_mlp": 1.00631785, + "epoch": 0.37527806168460287, + "flos": 14605731878400.0, + "grad_norm": 2.1535524784795212, + "language_loss": 0.74816889, + "learning_rate": 2.8745790201367976e-06, + "loss": 0.77035999, + "num_input_tokens_seen": 67060885, + "step": 3121, + "time_per_iteration": 3.5545825958251953 + }, + { + "auxiliary_loss_clip": 0.01150147, + "auxiliary_loss_mlp": 0.01086431, + "balance_loss_clip": 1.03569496, + "balance_loss_mlp": 1.00541615, + "epoch": 0.375398304575242, + "flos": 26390823431040.0, + "grad_norm": 2.167205534984747, + "language_loss": 0.84321338, + "learning_rate": 2.8738784084203373e-06, + "loss": 0.86557913, + "num_input_tokens_seen": 67080960, + "step": 3122, + "time_per_iteration": 2.7438292503356934 + }, + { + "auxiliary_loss_clip": 0.0113437, + "auxiliary_loss_mlp": 0.01086985, + "balance_loss_clip": 1.03651094, + "balance_loss_mlp": 1.00611317, + "epoch": 0.3755185474658811, + "flos": 22236605838720.0, + "grad_norm": 1.6650866306013417, + "language_loss": 0.7864368, + "learning_rate": 2.873177664137216e-06, + "loss": 0.80865037, + "num_input_tokens_seen": 67101890, + "step": 3123, + "time_per_iteration": 3.6526997089385986 + }, + { + "auxiliary_loss_clip": 0.01110468, + "auxiliary_loss_mlp": 0.01086081, + "balance_loss_clip": 1.03037024, + "balance_loss_mlp": 1.00511384, + "epoch": 0.37563879035652015, + "flos": 30812935633920.0, + "grad_norm": 2.118728302867883, + "language_loss": 0.69151354, + "learning_rate": 2.8724767873937384e-06, + "loss": 0.71347904, + "num_input_tokens_seen": 67126010, + "step": 3124, + "time_per_iteration": 2.8755128383636475 + }, + { + "auxiliary_loss_clip": 0.01114297, + "auxiliary_loss_mlp": 0.01085511, + "balance_loss_clip": 1.03371108, + "balance_loss_mlp": 1.00468707, + "epoch": 0.37575903324715926, + "flos": 20773533064320.0, + "grad_norm": 2.0300769719988843, + "language_loss": 0.87033242, + "learning_rate": 2.871775778296225e-06, + "loss": 0.89233053, + "num_input_tokens_seen": 67143100, + "step": 3125, + "time_per_iteration": 2.770016670227051 + }, + { + "auxiliary_loss_clip": 0.01133805, + "auxiliary_loss_mlp": 0.01086254, + "balance_loss_clip": 1.03074825, + "balance_loss_mlp": 1.00528717, + "epoch": 0.37587927613779837, + "flos": 18697681244160.0, + "grad_norm": 2.180328268437429, + "language_loss": 0.78101158, + "learning_rate": 2.8710746369510196e-06, + "loss": 0.80321217, + "num_input_tokens_seen": 67161085, + "step": 3126, + "time_per_iteration": 2.697129726409912 + }, + { + "auxiliary_loss_clip": 0.0113267, + "auxiliary_loss_mlp": 0.01087002, + "balance_loss_clip": 1.03476834, + "balance_loss_mlp": 1.00603485, + "epoch": 0.3759995190284374, + "flos": 13624796384640.0, + "grad_norm": 2.617309826514059, + "language_loss": 0.83173227, + "learning_rate": 2.8703733634644846e-06, + "loss": 0.85392904, + "num_input_tokens_seen": 67175840, + "step": 3127, + "time_per_iteration": 2.801234006881714 + }, + { + "auxiliary_loss_clip": 0.01150311, + "auxiliary_loss_mlp": 0.0108653, + "balance_loss_clip": 1.03657055, + "balance_loss_mlp": 1.00575352, + "epoch": 0.37611976191907653, + "flos": 20484847457280.0, + "grad_norm": 1.665572259953318, + "language_loss": 0.79174322, + "learning_rate": 2.869671957943002e-06, + "loss": 0.81411159, + "num_input_tokens_seen": 67194995, + "step": 3128, + "time_per_iteration": 2.6660728454589844 + }, + { + "auxiliary_loss_clip": 0.01122487, + "auxiliary_loss_mlp": 0.0108565, + "balance_loss_clip": 1.02945995, + "balance_loss_mlp": 1.00492167, + "epoch": 0.37624000480971564, + "flos": 21141797253120.0, + "grad_norm": 1.839199756308567, + "language_loss": 0.73939073, + "learning_rate": 2.8689704204929747e-06, + "loss": 0.76147211, + "num_input_tokens_seen": 67214175, + "step": 3129, + "time_per_iteration": 2.674785614013672 + }, + { + "auxiliary_loss_clip": 0.01149133, + "auxiliary_loss_mlp": 0.01085732, + "balance_loss_clip": 1.0349263, + "balance_loss_mlp": 1.00481296, + "epoch": 0.3763602477003547, + "flos": 22564470205440.0, + "grad_norm": 1.7914135857314581, + "language_loss": 0.80838245, + "learning_rate": 2.8682687512208253e-06, + "loss": 0.83073103, + "num_input_tokens_seen": 67233185, + "step": 3130, + "time_per_iteration": 2.6461806297302246 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.01085615, + "balance_loss_clip": 1.03489339, + "balance_loss_mlp": 1.00455308, + "epoch": 0.3764804905909938, + "flos": 27526857851520.0, + "grad_norm": 1.9541445340184564, + "language_loss": 0.80746371, + "learning_rate": 2.8675669502329972e-06, + "loss": 0.82973862, + "num_input_tokens_seen": 67254715, + "step": 3131, + "time_per_iteration": 2.731564998626709 + }, + { + "auxiliary_loss_clip": 0.01136534, + "auxiliary_loss_mlp": 0.00873374, + "balance_loss_clip": 1.03173792, + "balance_loss_mlp": 1.00038707, + "epoch": 0.3766007334816329, + "flos": 22528092706560.0, + "grad_norm": 2.230526118276141, + "language_loss": 0.85446823, + "learning_rate": 2.866865017635952e-06, + "loss": 0.87456727, + "num_input_tokens_seen": 67272535, + "step": 3132, + "time_per_iteration": 2.6570937633514404 + }, + { + "auxiliary_loss_clip": 0.01112054, + "auxiliary_loss_mlp": 0.0108696, + "balance_loss_clip": 1.02782702, + "balance_loss_mlp": 1.00589752, + "epoch": 0.376720976372272, + "flos": 25957166532480.0, + "grad_norm": 1.5469093094595512, + "language_loss": 0.79388243, + "learning_rate": 2.866162953536174e-06, + "loss": 0.81587261, + "num_input_tokens_seen": 67293505, + "step": 3133, + "time_per_iteration": 2.868767738342285 + }, + { + "auxiliary_loss_clip": 0.01133048, + "auxiliary_loss_mlp": 0.00873458, + "balance_loss_clip": 1.03511238, + "balance_loss_mlp": 1.00054622, + "epoch": 0.3768412192629111, + "flos": 18041162411520.0, + "grad_norm": 1.5712758135266152, + "language_loss": 0.74748391, + "learning_rate": 2.8654607580401634e-06, + "loss": 0.76754898, + "num_input_tokens_seen": 67313240, + "step": 3134, + "time_per_iteration": 2.87092924118042 + }, + { + "auxiliary_loss_clip": 0.01131569, + "auxiliary_loss_mlp": 0.01079669, + "balance_loss_clip": 1.04122329, + "balance_loss_mlp": 1.0003233, + "epoch": 0.3769614621535502, + "flos": 62989472304000.0, + "grad_norm": 0.9064843189789276, + "language_loss": 0.65219933, + "learning_rate": 2.8647584312544446e-06, + "loss": 0.6743117, + "num_input_tokens_seen": 67378445, + "step": 3135, + "time_per_iteration": 3.239654064178467 + }, + { + "auxiliary_loss_clip": 0.01107138, + "auxiliary_loss_mlp": 0.00873337, + "balance_loss_clip": 1.0334487, + "balance_loss_mlp": 1.00038302, + "epoch": 0.37708170504418925, + "flos": 23661685002240.0, + "grad_norm": 1.4096731362404826, + "language_loss": 0.8496269, + "learning_rate": 2.864055973285559e-06, + "loss": 0.86943161, + "num_input_tokens_seen": 67400445, + "step": 3136, + "time_per_iteration": 2.8619604110717773 + }, + { + "auxiliary_loss_clip": 0.01129906, + "auxiliary_loss_mlp": 0.01086094, + "balance_loss_clip": 1.03374434, + "balance_loss_mlp": 1.00541317, + "epoch": 0.37720194793482836, + "flos": 24423170353920.0, + "grad_norm": 1.992018309821265, + "language_loss": 0.86409491, + "learning_rate": 2.8633533842400698e-06, + "loss": 0.88625491, + "num_input_tokens_seen": 67420645, + "step": 3137, + "time_per_iteration": 2.7599775791168213 + }, + { + "auxiliary_loss_clip": 0.0113961, + "auxiliary_loss_mlp": 0.00873541, + "balance_loss_clip": 1.03476608, + "balance_loss_mlp": 1.00044978, + "epoch": 0.3773221908254674, + "flos": 20996502739200.0, + "grad_norm": 1.81299193731729, + "language_loss": 0.76737916, + "learning_rate": 2.862650664224558e-06, + "loss": 0.78751063, + "num_input_tokens_seen": 67439495, + "step": 3138, + "time_per_iteration": 2.8070595264434814 + }, + { + "auxiliary_loss_clip": 0.01139532, + "auxiliary_loss_mlp": 0.01086077, + "balance_loss_clip": 1.03441501, + "balance_loss_mlp": 1.00530124, + "epoch": 0.37744243371610653, + "flos": 37631724958080.0, + "grad_norm": 1.4304300467212547, + "language_loss": 0.69938827, + "learning_rate": 2.861947813345627e-06, + "loss": 0.7216444, + "num_input_tokens_seen": 67462195, + "step": 3139, + "time_per_iteration": 3.8056581020355225 + }, + { + "auxiliary_loss_clip": 0.01150366, + "auxiliary_loss_mlp": 0.00873424, + "balance_loss_clip": 1.03610206, + "balance_loss_mlp": 1.00038934, + "epoch": 0.37756267660674564, + "flos": 26140526484480.0, + "grad_norm": 5.50128959681252, + "language_loss": 0.72330678, + "learning_rate": 2.8612448317098974e-06, + "loss": 0.7435447, + "num_input_tokens_seen": 67482530, + "step": 3140, + "time_per_iteration": 2.6774826049804688 + }, + { + "auxiliary_loss_clip": 0.01120481, + "auxiliary_loss_mlp": 0.00873394, + "balance_loss_clip": 1.03356242, + "balance_loss_mlp": 1.0003376, + "epoch": 0.3776829194973847, + "flos": 19427888828160.0, + "grad_norm": 2.0185417455067785, + "language_loss": 0.83067954, + "learning_rate": 2.8605417194240114e-06, + "loss": 0.8506183, + "num_input_tokens_seen": 67500890, + "step": 3141, + "time_per_iteration": 2.835035562515259 + }, + { + "auxiliary_loss_clip": 0.01136668, + "auxiliary_loss_mlp": 0.01085717, + "balance_loss_clip": 1.03310061, + "balance_loss_mlp": 1.00498843, + "epoch": 0.3778031623880238, + "flos": 17382309194880.0, + "grad_norm": 1.8208170947650322, + "language_loss": 0.78533626, + "learning_rate": 2.8598384765946315e-06, + "loss": 0.80756009, + "num_input_tokens_seen": 67519545, + "step": 3142, + "time_per_iteration": 2.749026298522949 + }, + { + "auxiliary_loss_clip": 0.01148793, + "auxiliary_loss_mlp": 0.0108616, + "balance_loss_clip": 1.03412032, + "balance_loss_mlp": 1.0052408, + "epoch": 0.3779234052786629, + "flos": 27125843437440.0, + "grad_norm": 1.7597035335490678, + "language_loss": 0.71859837, + "learning_rate": 2.8591351033284377e-06, + "loss": 0.74094784, + "num_input_tokens_seen": 67539275, + "step": 3143, + "time_per_iteration": 3.6203033924102783 + }, + { + "auxiliary_loss_clip": 0.01124486, + "auxiliary_loss_mlp": 0.01085786, + "balance_loss_clip": 1.03384829, + "balance_loss_mlp": 1.00505781, + "epoch": 0.37804364816930197, + "flos": 19682639061120.0, + "grad_norm": 2.9460366347582734, + "language_loss": 0.83672297, + "learning_rate": 2.8584315997321325e-06, + "loss": 0.85882568, + "num_input_tokens_seen": 67558280, + "step": 3144, + "time_per_iteration": 2.6842710971832275 + }, + { + "auxiliary_loss_clip": 0.01148503, + "auxiliary_loss_mlp": 0.0108626, + "balance_loss_clip": 1.03440356, + "balance_loss_mlp": 1.00543642, + "epoch": 0.3781638910599411, + "flos": 22702905221760.0, + "grad_norm": 2.3250122655883616, + "language_loss": 0.78445148, + "learning_rate": 2.8577279659124356e-06, + "loss": 0.80679905, + "num_input_tokens_seen": 67575955, + "step": 3145, + "time_per_iteration": 2.6518895626068115 + }, + { + "auxiliary_loss_clip": 0.01140348, + "auxiliary_loss_mlp": 0.01085729, + "balance_loss_clip": 1.03433323, + "balance_loss_mlp": 1.00500035, + "epoch": 0.3782841339505802, + "flos": 14647604158080.0, + "grad_norm": 1.8609503131855427, + "language_loss": 0.83392519, + "learning_rate": 2.857024201976089e-06, + "loss": 0.85618591, + "num_input_tokens_seen": 67593515, + "step": 3146, + "time_per_iteration": 2.7571098804473877 + }, + { + "auxiliary_loss_clip": 0.01120377, + "auxiliary_loss_mlp": 0.01085986, + "balance_loss_clip": 1.03099728, + "balance_loss_mlp": 1.00487566, + "epoch": 0.37840437684121925, + "flos": 32818223185920.0, + "grad_norm": 1.9418948657008255, + "language_loss": 0.72920728, + "learning_rate": 2.8563203080298516e-06, + "loss": 0.75127089, + "num_input_tokens_seen": 67614290, + "step": 3147, + "time_per_iteration": 3.8379712104797363 + }, + { + "auxiliary_loss_clip": 0.01129413, + "auxiliary_loss_mlp": 0.00873409, + "balance_loss_clip": 1.03360784, + "balance_loss_mlp": 1.00038159, + "epoch": 0.37852461973185836, + "flos": 18369206346240.0, + "grad_norm": 2.695321602971798, + "language_loss": 0.89435965, + "learning_rate": 2.855616284180505e-06, + "loss": 0.91438788, + "num_input_tokens_seen": 67631340, + "step": 3148, + "time_per_iteration": 3.694003105163574 + }, + { + "auxiliary_loss_clip": 0.01132793, + "auxiliary_loss_mlp": 0.01079554, + "balance_loss_clip": 1.04202199, + "balance_loss_mlp": 1.00020826, + "epoch": 0.37864486262249747, + "flos": 59500680117120.0, + "grad_norm": 0.8798716924933284, + "language_loss": 0.66148621, + "learning_rate": 2.8549121305348477e-06, + "loss": 0.68360972, + "num_input_tokens_seen": 67691125, + "step": 3149, + "time_per_iteration": 3.3253238201141357 + }, + { + "auxiliary_loss_clip": 0.01141214, + "auxiliary_loss_mlp": 0.01085306, + "balance_loss_clip": 1.03533638, + "balance_loss_mlp": 1.00467253, + "epoch": 0.3787651055131365, + "flos": 23363015414400.0, + "grad_norm": 2.1426542059221183, + "language_loss": 0.83356309, + "learning_rate": 2.8542078471997006e-06, + "loss": 0.85582829, + "num_input_tokens_seen": 67708740, + "step": 3150, + "time_per_iteration": 2.685237407684326 + }, + { + "auxiliary_loss_clip": 0.0113894, + "auxiliary_loss_mlp": 0.01085784, + "balance_loss_clip": 1.03349829, + "balance_loss_mlp": 1.00505555, + "epoch": 0.37888534840377563, + "flos": 24601394661120.0, + "grad_norm": 4.62948345091832, + "language_loss": 0.75759029, + "learning_rate": 2.8535034342819013e-06, + "loss": 0.77983749, + "num_input_tokens_seen": 67726150, + "step": 3151, + "time_per_iteration": 2.690199851989746 + }, + { + "auxiliary_loss_clip": 0.01149981, + "auxiliary_loss_mlp": 0.01086682, + "balance_loss_clip": 1.03616834, + "balance_loss_mlp": 1.00600147, + "epoch": 0.37900559129441475, + "flos": 23986891762560.0, + "grad_norm": 1.5532745558485246, + "language_loss": 0.7230258, + "learning_rate": 2.85279889188831e-06, + "loss": 0.7453925, + "num_input_tokens_seen": 67746525, + "step": 3152, + "time_per_iteration": 2.6699087619781494 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01087219, + "balance_loss_clip": 1.0332545, + "balance_loss_mlp": 1.0063473, + "epoch": 0.3791258341850538, + "flos": 24644667571200.0, + "grad_norm": 2.8016110053889762, + "language_loss": 0.81297004, + "learning_rate": 2.852094220125805e-06, + "loss": 0.83507264, + "num_input_tokens_seen": 67766035, + "step": 3153, + "time_per_iteration": 2.855882167816162 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.01086172, + "balance_loss_clip": 1.03445697, + "balance_loss_mlp": 1.00530088, + "epoch": 0.3792460770756929, + "flos": 17420841509760.0, + "grad_norm": 3.8821452977793967, + "language_loss": 0.71272707, + "learning_rate": 2.8513894191012846e-06, + "loss": 0.73498416, + "num_input_tokens_seen": 67785015, + "step": 3154, + "time_per_iteration": 2.6647441387176514 + }, + { + "auxiliary_loss_clip": 0.01149658, + "auxiliary_loss_mlp": 0.01085375, + "balance_loss_clip": 1.0352546, + "balance_loss_mlp": 1.00459933, + "epoch": 0.37936631996633197, + "flos": 24206557386240.0, + "grad_norm": 1.5369924911967692, + "language_loss": 0.78789747, + "learning_rate": 2.8506844889216664e-06, + "loss": 0.81024778, + "num_input_tokens_seen": 67804400, + "step": 3155, + "time_per_iteration": 2.6619155406951904 + }, + { + "auxiliary_loss_clip": 0.01126263, + "auxiliary_loss_mlp": 0.01079785, + "balance_loss_clip": 1.03540039, + "balance_loss_mlp": 1.00043964, + "epoch": 0.3794865628569711, + "flos": 70297114752000.0, + "grad_norm": 0.8721595339534158, + "language_loss": 0.62828875, + "learning_rate": 2.849979429693887e-06, + "loss": 0.65034926, + "num_input_tokens_seen": 67865385, + "step": 3156, + "time_per_iteration": 3.3076865673065186 + }, + { + "auxiliary_loss_clip": 0.01149397, + "auxiliary_loss_mlp": 0.01086516, + "balance_loss_clip": 1.03572631, + "balance_loss_mlp": 1.00550163, + "epoch": 0.3796068057476102, + "flos": 15779364860160.0, + "grad_norm": 1.9301413962130183, + "language_loss": 0.74130523, + "learning_rate": 2.8492742415249042e-06, + "loss": 0.76366436, + "num_input_tokens_seen": 67883030, + "step": 3157, + "time_per_iteration": 2.635627031326294 + }, + { + "auxiliary_loss_clip": 0.01148741, + "auxiliary_loss_mlp": 0.01086024, + "balance_loss_clip": 1.0351702, + "balance_loss_mlp": 1.00515199, + "epoch": 0.37972704863824924, + "flos": 25191694771200.0, + "grad_norm": 1.5701636114256663, + "language_loss": 0.76274312, + "learning_rate": 2.848568924521694e-06, + "loss": 0.78509074, + "num_input_tokens_seen": 67903810, + "step": 3158, + "time_per_iteration": 2.7503135204315186 + }, + { + "auxiliary_loss_clip": 0.01139505, + "auxiliary_loss_mlp": 0.01085233, + "balance_loss_clip": 1.03348517, + "balance_loss_mlp": 1.00440955, + "epoch": 0.37984729152888835, + "flos": 26210372480640.0, + "grad_norm": 1.7133172281830862, + "language_loss": 0.73449421, + "learning_rate": 2.8478634787912526e-06, + "loss": 0.75674158, + "num_input_tokens_seen": 67921865, + "step": 3159, + "time_per_iteration": 2.7485270500183105 + }, + { + "auxiliary_loss_clip": 0.01141307, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_clip": 1.0354085, + "balance_loss_mlp": 1.00611734, + "epoch": 0.37996753441952746, + "flos": 25629302165760.0, + "grad_norm": 2.1450430093314323, + "language_loss": 0.76515293, + "learning_rate": 2.847157904440596e-06, + "loss": 0.7874378, + "num_input_tokens_seen": 67941595, + "step": 3160, + "time_per_iteration": 2.7903740406036377 + }, + { + "auxiliary_loss_clip": 0.01138414, + "auxiliary_loss_mlp": 0.01085964, + "balance_loss_clip": 1.03350246, + "balance_loss_mlp": 1.00509286, + "epoch": 0.3800877773101665, + "flos": 20118414862080.0, + "grad_norm": 1.5044889215865025, + "language_loss": 0.73624772, + "learning_rate": 2.846452201576759e-06, + "loss": 0.7584914, + "num_input_tokens_seen": 67960970, + "step": 3161, + "time_per_iteration": 2.6840996742248535 + }, + { + "auxiliary_loss_clip": 0.01121866, + "auxiliary_loss_mlp": 0.01079372, + "balance_loss_clip": 1.03914738, + "balance_loss_mlp": 1.00002599, + "epoch": 0.38020802020080563, + "flos": 63053608037760.0, + "grad_norm": 0.8487837665389707, + "language_loss": 0.62840629, + "learning_rate": 2.845746370306795e-06, + "loss": 0.6504187, + "num_input_tokens_seen": 68026160, + "step": 3162, + "time_per_iteration": 3.410862445831299 + }, + { + "auxiliary_loss_clip": 0.0113951, + "auxiliary_loss_mlp": 0.0108593, + "balance_loss_clip": 1.03391266, + "balance_loss_mlp": 1.00505829, + "epoch": 0.38032826309144474, + "flos": 21288420570240.0, + "grad_norm": 1.8243062073318674, + "language_loss": 0.7823422, + "learning_rate": 2.84504041073778e-06, + "loss": 0.8045966, + "num_input_tokens_seen": 68044575, + "step": 3163, + "time_per_iteration": 2.720343589782715 + }, + { + "auxiliary_loss_clip": 0.01129613, + "auxiliary_loss_mlp": 0.01086813, + "balance_loss_clip": 1.0334723, + "balance_loss_mlp": 1.00579858, + "epoch": 0.3804485059820838, + "flos": 18954119416320.0, + "grad_norm": 1.6040154904055628, + "language_loss": 0.78612769, + "learning_rate": 2.844334322976806e-06, + "loss": 0.80829191, + "num_input_tokens_seen": 68064790, + "step": 3164, + "time_per_iteration": 3.5795907974243164 + }, + { + "auxiliary_loss_clip": 0.01104623, + "auxiliary_loss_mlp": 0.01086759, + "balance_loss_clip": 1.029284, + "balance_loss_mlp": 1.00569677, + "epoch": 0.3805687488727229, + "flos": 21833759831040.0, + "grad_norm": 4.017793333361282, + "language_loss": 0.83462048, + "learning_rate": 2.8436281071309866e-06, + "loss": 0.85653424, + "num_input_tokens_seen": 68083330, + "step": 3165, + "time_per_iteration": 2.8605844974517822 + }, + { + "auxiliary_loss_clip": 0.0110513, + "auxiliary_loss_mlp": 0.01079384, + "balance_loss_clip": 1.0382092, + "balance_loss_mlp": 1.00003815, + "epoch": 0.380688991763362, + "flos": 58546209968640.0, + "grad_norm": 0.7314173184040685, + "language_loss": 0.53004611, + "learning_rate": 2.842921763307455e-06, + "loss": 0.55189121, + "num_input_tokens_seen": 68146140, + "step": 3166, + "time_per_iteration": 3.3631203174591064 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01085456, + "balance_loss_clip": 1.03499234, + "balance_loss_mlp": 1.004632, + "epoch": 0.38080923465400107, + "flos": 23799509487360.0, + "grad_norm": 1.8620412163047693, + "language_loss": 0.82051289, + "learning_rate": 2.842215291613361e-06, + "loss": 0.84269381, + "num_input_tokens_seen": 68164520, + "step": 3167, + "time_per_iteration": 2.742111921310425 + }, + { + "auxiliary_loss_clip": 0.01066005, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_clip": 1.02646863, + "balance_loss_mlp": 1.00129879, + "epoch": 0.3809294775446402, + "flos": 54969866380800.0, + "grad_norm": 0.7775119187256407, + "language_loss": 0.59236437, + "learning_rate": 2.8415086921558774e-06, + "loss": 0.61383086, + "num_input_tokens_seen": 68227945, + "step": 3168, + "time_per_iteration": 4.2818763256073 + }, + { + "auxiliary_loss_clip": 0.01130316, + "auxiliary_loss_mlp": 0.01085351, + "balance_loss_clip": 1.03256249, + "balance_loss_mlp": 1.00452709, + "epoch": 0.38104972043527924, + "flos": 24643697904000.0, + "grad_norm": 1.4515419138907684, + "language_loss": 0.78609717, + "learning_rate": 2.840801965042194e-06, + "loss": 0.80825388, + "num_input_tokens_seen": 68247405, + "step": 3169, + "time_per_iteration": 2.8217697143554688 + }, + { + "auxiliary_loss_clip": 0.01133039, + "auxiliary_loss_mlp": 0.0108731, + "balance_loss_clip": 1.0348804, + "balance_loss_mlp": 1.00629544, + "epoch": 0.38116996332591835, + "flos": 22856783086080.0, + "grad_norm": 2.525141879287519, + "language_loss": 0.84007943, + "learning_rate": 2.840095110379521e-06, + "loss": 0.86228287, + "num_input_tokens_seen": 68266925, + "step": 3170, + "time_per_iteration": 2.727206230163574 + }, + { + "auxiliary_loss_clip": 0.01102663, + "auxiliary_loss_mlp": 0.01079616, + "balance_loss_clip": 1.03733516, + "balance_loss_mlp": 1.00027037, + "epoch": 0.38129020621655746, + "flos": 60836160804480.0, + "grad_norm": 0.73465956901019, + "language_loss": 0.53931993, + "learning_rate": 2.8393881282750884e-06, + "loss": 0.56114274, + "num_input_tokens_seen": 68329755, + "step": 3171, + "time_per_iteration": 4.277172803878784 + }, + { + "auxiliary_loss_clip": 0.01120683, + "auxiliary_loss_mlp": 0.01086699, + "balance_loss_clip": 1.03034782, + "balance_loss_mlp": 1.00578022, + "epoch": 0.3814104491071965, + "flos": 21648101408640.0, + "grad_norm": 1.8181323626485626, + "language_loss": 0.78111768, + "learning_rate": 2.838681018836144e-06, + "loss": 0.80319148, + "num_input_tokens_seen": 68347075, + "step": 3172, + "time_per_iteration": 2.7823784351348877 + }, + { + "auxiliary_loss_clip": 0.01122029, + "auxiliary_loss_mlp": 0.00873399, + "balance_loss_clip": 1.03278172, + "balance_loss_mlp": 1.00045109, + "epoch": 0.3815306919978356, + "flos": 19099090707840.0, + "grad_norm": 1.7782794438324354, + "language_loss": 0.78065825, + "learning_rate": 2.837973782169955e-06, + "loss": 0.80061251, + "num_input_tokens_seen": 68365450, + "step": 3173, + "time_per_iteration": 3.6464157104492188 + }, + { + "auxiliary_loss_clip": 0.0113661, + "auxiliary_loss_mlp": 0.01079468, + "balance_loss_clip": 1.03797877, + "balance_loss_mlp": 1.00012207, + "epoch": 0.38165093488847474, + "flos": 67067918156160.0, + "grad_norm": 0.8081361236894095, + "language_loss": 0.59136391, + "learning_rate": 2.8372664183838096e-06, + "loss": 0.61352473, + "num_input_tokens_seen": 68428470, + "step": 3174, + "time_per_iteration": 3.296400308609009 + }, + { + "auxiliary_loss_clip": 0.01147633, + "auxiliary_loss_mlp": 0.01084597, + "balance_loss_clip": 1.03415751, + "balance_loss_mlp": 1.00377274, + "epoch": 0.3817711777791138, + "flos": 22341105480960.0, + "grad_norm": 1.9440629035591632, + "language_loss": 0.67769051, + "learning_rate": 2.836558927585015e-06, + "loss": 0.7000128, + "num_input_tokens_seen": 68445440, + "step": 3175, + "time_per_iteration": 2.6725635528564453 + }, + { + "auxiliary_loss_clip": 0.01138784, + "auxiliary_loss_mlp": 0.01085838, + "balance_loss_clip": 1.03279793, + "balance_loss_mlp": 1.00491858, + "epoch": 0.3818914206697529, + "flos": 22820621068800.0, + "grad_norm": 2.24563825553084, + "language_loss": 0.82822573, + "learning_rate": 2.8358513098808957e-06, + "loss": 0.85047191, + "num_input_tokens_seen": 68465755, + "step": 3176, + "time_per_iteration": 2.699117422103882 + }, + { + "auxiliary_loss_clip": 0.01106624, + "auxiliary_loss_mlp": 0.01085732, + "balance_loss_clip": 1.02981925, + "balance_loss_mlp": 1.00495601, + "epoch": 0.382011663560392, + "flos": 24386074583040.0, + "grad_norm": 2.414022664518783, + "language_loss": 0.76820576, + "learning_rate": 2.835143565378798e-06, + "loss": 0.7901293, + "num_input_tokens_seen": 68486220, + "step": 3177, + "time_per_iteration": 2.867187261581421 + }, + { + "auxiliary_loss_clip": 0.01098919, + "auxiliary_loss_mlp": 0.01085399, + "balance_loss_clip": 1.03028154, + "balance_loss_mlp": 1.00447941, + "epoch": 0.38213190645103107, + "flos": 21981568296960.0, + "grad_norm": 2.116552639149261, + "language_loss": 0.78339016, + "learning_rate": 2.8344356941860847e-06, + "loss": 0.80523336, + "num_input_tokens_seen": 68505850, + "step": 3178, + "time_per_iteration": 2.8288326263427734 + }, + { + "auxiliary_loss_clip": 0.01121439, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_clip": 1.03315759, + "balance_loss_mlp": 1.0042479, + "epoch": 0.3822521493416702, + "flos": 35516945773440.0, + "grad_norm": 1.9899982088899717, + "language_loss": 0.65947855, + "learning_rate": 2.8337276964101403e-06, + "loss": 0.68154371, + "num_input_tokens_seen": 68526290, + "step": 3179, + "time_per_iteration": 2.9443724155426025 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01085272, + "balance_loss_clip": 1.03351426, + "balance_loss_mlp": 1.00444818, + "epoch": 0.3823723922323093, + "flos": 21069904181760.0, + "grad_norm": 1.8389531352116215, + "language_loss": 0.76296949, + "learning_rate": 2.833019572158367e-06, + "loss": 0.78521299, + "num_input_tokens_seen": 68544725, + "step": 3180, + "time_per_iteration": 2.719153642654419 + }, + { + "auxiliary_loss_clip": 0.01128033, + "auxiliary_loss_mlp": 0.01086368, + "balance_loss_clip": 1.03317118, + "balance_loss_mlp": 1.00559223, + "epoch": 0.38249263512294834, + "flos": 19789149864960.0, + "grad_norm": 1.9008240066423334, + "language_loss": 0.8019824, + "learning_rate": 2.8323113215381872e-06, + "loss": 0.82412642, + "num_input_tokens_seen": 68563070, + "step": 3181, + "time_per_iteration": 2.7554142475128174 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.01085921, + "balance_loss_clip": 1.03128529, + "balance_loss_mlp": 1.00490618, + "epoch": 0.38261287801358745, + "flos": 21433930565760.0, + "grad_norm": 1.918680940470014, + "language_loss": 0.76460588, + "learning_rate": 2.831602944657042e-06, + "loss": 0.78664911, + "num_input_tokens_seen": 68581150, + "step": 3182, + "time_per_iteration": 2.789573907852173 + }, + { + "auxiliary_loss_clip": 0.01117034, + "auxiliary_loss_mlp": 0.01086467, + "balance_loss_clip": 1.03438115, + "balance_loss_mlp": 1.00564313, + "epoch": 0.38273312090422656, + "flos": 21981568296960.0, + "grad_norm": 2.4833842219521296, + "language_loss": 0.74226505, + "learning_rate": 2.830894441622391e-06, + "loss": 0.76430011, + "num_input_tokens_seen": 68597800, + "step": 3183, + "time_per_iteration": 2.711097478866577 + }, + { + "auxiliary_loss_clip": 0.01122202, + "auxiliary_loss_mlp": 0.00873415, + "balance_loss_clip": 1.03287768, + "balance_loss_mlp": 1.0004096, + "epoch": 0.3828533637948656, + "flos": 24790895838720.0, + "grad_norm": 1.7602760388606506, + "language_loss": 0.80062222, + "learning_rate": 2.8301858125417134e-06, + "loss": 0.8205784, + "num_input_tokens_seen": 68617640, + "step": 3184, + "time_per_iteration": 2.8155689239501953 + }, + { + "auxiliary_loss_clip": 0.01126307, + "auxiliary_loss_mlp": 0.01085432, + "balance_loss_clip": 1.03155661, + "balance_loss_mlp": 1.00470304, + "epoch": 0.38297360668550473, + "flos": 22455445449600.0, + "grad_norm": 1.6510859497154007, + "language_loss": 0.73841989, + "learning_rate": 2.8294770575225082e-06, + "loss": 0.76053733, + "num_input_tokens_seen": 68637770, + "step": 3185, + "time_per_iteration": 2.734605550765991 + }, + { + "auxiliary_loss_clip": 0.01138825, + "auxiliary_loss_mlp": 0.01087172, + "balance_loss_clip": 1.03446662, + "balance_loss_mlp": 1.00625277, + "epoch": 0.3830938495761438, + "flos": 24896903852160.0, + "grad_norm": 1.6398849746518174, + "language_loss": 0.83612609, + "learning_rate": 2.828768176672293e-06, + "loss": 0.85838604, + "num_input_tokens_seen": 68656885, + "step": 3186, + "time_per_iteration": 2.7642126083374023 + }, + { + "auxiliary_loss_clip": 0.01120974, + "auxiliary_loss_mlp": 0.01086024, + "balance_loss_clip": 1.03165996, + "balance_loss_mlp": 1.00505745, + "epoch": 0.3832140924667829, + "flos": 33036236784000.0, + "grad_norm": 1.5201771587711166, + "language_loss": 0.71617693, + "learning_rate": 2.8280591700986044e-06, + "loss": 0.73824686, + "num_input_tokens_seen": 68678750, + "step": 3187, + "time_per_iteration": 2.861574649810791 + }, + { + "auxiliary_loss_clip": 0.01130498, + "auxiliary_loss_mlp": 0.01084757, + "balance_loss_clip": 1.03285933, + "balance_loss_mlp": 1.00402808, + "epoch": 0.383334335357422, + "flos": 31903721896320.0, + "grad_norm": 1.6312707877551624, + "language_loss": 0.74650347, + "learning_rate": 2.827350037908999e-06, + "loss": 0.76865602, + "num_input_tokens_seen": 68698190, + "step": 3188, + "time_per_iteration": 2.8597683906555176 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01086333, + "balance_loss_clip": 1.03075004, + "balance_loss_mlp": 1.00536585, + "epoch": 0.38345457824806106, + "flos": 19791915212160.0, + "grad_norm": 1.9694404987112017, + "language_loss": 0.78636056, + "learning_rate": 2.8266407802110496e-06, + "loss": 0.80840445, + "num_input_tokens_seen": 68716445, + "step": 3189, + "time_per_iteration": 3.6397106647491455 + }, + { + "auxiliary_loss_clip": 0.01092993, + "auxiliary_loss_mlp": 0.01086314, + "balance_loss_clip": 1.03073978, + "balance_loss_mlp": 1.00534678, + "epoch": 0.3835748211387002, + "flos": 22419391173120.0, + "grad_norm": 1.9347049592156114, + "language_loss": 0.75919759, + "learning_rate": 2.8259313971123515e-06, + "loss": 0.78099072, + "num_input_tokens_seen": 68737565, + "step": 3190, + "time_per_iteration": 2.906135082244873 + }, + { + "auxiliary_loss_clip": 0.01137003, + "auxiliary_loss_mlp": 0.01085869, + "balance_loss_clip": 1.03314173, + "balance_loss_mlp": 1.00523567, + "epoch": 0.3836950640293393, + "flos": 25118436983040.0, + "grad_norm": 1.5764509665526285, + "language_loss": 0.78389877, + "learning_rate": 2.8252218887205166e-06, + "loss": 0.80612743, + "num_input_tokens_seen": 68758255, + "step": 3191, + "time_per_iteration": 2.8340959548950195 + }, + { + "auxiliary_loss_clip": 0.01099596, + "auxiliary_loss_mlp": 0.01086374, + "balance_loss_clip": 1.02869606, + "balance_loss_mlp": 1.0054543, + "epoch": 0.38381530691997834, + "flos": 21799213925760.0, + "grad_norm": 1.7683290632889823, + "language_loss": 0.80687153, + "learning_rate": 2.824512255143178e-06, + "loss": 0.8287313, + "num_input_tokens_seen": 68777490, + "step": 3192, + "time_per_iteration": 2.8308489322662354 + }, + { + "auxiliary_loss_clip": 0.01115318, + "auxiliary_loss_mlp": 0.01085333, + "balance_loss_clip": 1.03209853, + "balance_loss_mlp": 1.00455642, + "epoch": 0.38393554981061745, + "flos": 21252689516160.0, + "grad_norm": 2.0431578297280275, + "language_loss": 0.79347849, + "learning_rate": 2.8238024964879855e-06, + "loss": 0.815485, + "num_input_tokens_seen": 68798385, + "step": 3193, + "time_per_iteration": 3.642449140548706 + }, + { + "auxiliary_loss_clip": 0.01149941, + "auxiliary_loss_mlp": 0.0108674, + "balance_loss_clip": 1.0356766, + "balance_loss_mlp": 1.00577259, + "epoch": 0.38405579270125656, + "flos": 17019360218880.0, + "grad_norm": 2.0859924021194445, + "language_loss": 0.76916885, + "learning_rate": 2.8230926128626095e-06, + "loss": 0.79153562, + "num_input_tokens_seen": 68816880, + "step": 3194, + "time_per_iteration": 2.6823830604553223 + }, + { + "auxiliary_loss_clip": 0.01129037, + "auxiliary_loss_mlp": 0.01085644, + "balance_loss_clip": 1.03257632, + "balance_loss_mlp": 1.00486815, + "epoch": 0.3841760355918956, + "flos": 21835375943040.0, + "grad_norm": 1.9400504353031491, + "language_loss": 0.79061645, + "learning_rate": 2.822382604374738e-06, + "loss": 0.81276321, + "num_input_tokens_seen": 68835805, + "step": 3195, + "time_per_iteration": 2.7220304012298584 + }, + { + "auxiliary_loss_clip": 0.01128826, + "auxiliary_loss_mlp": 0.01085517, + "balance_loss_clip": 1.03320837, + "balance_loss_mlp": 1.00474095, + "epoch": 0.3842962784825347, + "flos": 25915114684800.0, + "grad_norm": 2.1432057638927686, + "language_loss": 0.65837431, + "learning_rate": 2.8216724711320793e-06, + "loss": 0.68051779, + "num_input_tokens_seen": 68854930, + "step": 3196, + "time_per_iteration": 2.7628703117370605 + }, + { + "auxiliary_loss_clip": 0.01149051, + "auxiliary_loss_mlp": 0.00873112, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.00031257, + "epoch": 0.38441652137317384, + "flos": 25337492075520.0, + "grad_norm": 1.5015418614256197, + "language_loss": 0.79614639, + "learning_rate": 2.820962213242361e-06, + "loss": 0.81636798, + "num_input_tokens_seen": 68874260, + "step": 3197, + "time_per_iteration": 3.684422254562378 + }, + { + "auxiliary_loss_clip": 0.01133486, + "auxiliary_loss_mlp": 0.01084928, + "balance_loss_clip": 1.03113401, + "balance_loss_mlp": 1.00419903, + "epoch": 0.3845367642638129, + "flos": 18113486446080.0, + "grad_norm": 2.2334291275127023, + "language_loss": 0.84248459, + "learning_rate": 2.8202518308133264e-06, + "loss": 0.86466873, + "num_input_tokens_seen": 68891535, + "step": 3198, + "time_per_iteration": 3.6020665168762207 + }, + { + "auxiliary_loss_clip": 0.01148156, + "auxiliary_loss_mlp": 0.01085656, + "balance_loss_clip": 1.03430545, + "balance_loss_mlp": 1.00483239, + "epoch": 0.384657007154452, + "flos": 25228395492480.0, + "grad_norm": 2.0778381181809724, + "language_loss": 0.7318095, + "learning_rate": 2.8195413239527426e-06, + "loss": 0.75414765, + "num_input_tokens_seen": 68911275, + "step": 3199, + "time_per_iteration": 2.6981964111328125 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01087747, + "balance_loss_clip": 1.03387737, + "balance_loss_mlp": 1.00687504, + "epoch": 0.38477725004509106, + "flos": 19865855358720.0, + "grad_norm": 1.9937146906195007, + "language_loss": 0.80900264, + "learning_rate": 2.8188306927683906e-06, + "loss": 0.83128136, + "num_input_tokens_seen": 68930745, + "step": 3200, + "time_per_iteration": 2.674647092819214 + }, + { + "auxiliary_loss_clip": 0.01129107, + "auxiliary_loss_mlp": 0.01085952, + "balance_loss_clip": 1.03350842, + "balance_loss_mlp": 1.00512815, + "epoch": 0.38489749293573017, + "flos": 18259391491200.0, + "grad_norm": 2.202210888707017, + "language_loss": 0.7479192, + "learning_rate": 2.818119937368074e-06, + "loss": 0.77006984, + "num_input_tokens_seen": 68949380, + "step": 3201, + "time_per_iteration": 2.772857189178467 + }, + { + "auxiliary_loss_clip": 0.01141241, + "auxiliary_loss_mlp": 0.01086478, + "balance_loss_clip": 1.03481531, + "balance_loss_mlp": 1.00541532, + "epoch": 0.3850177358263693, + "flos": 24389163152640.0, + "grad_norm": 2.032586436748439, + "language_loss": 0.65740776, + "learning_rate": 2.817409057859613e-06, + "loss": 0.679685, + "num_input_tokens_seen": 68968370, + "step": 3202, + "time_per_iteration": 2.7633326053619385 + }, + { + "auxiliary_loss_clip": 0.01100008, + "auxiliary_loss_mlp": 0.0108712, + "balance_loss_clip": 1.02800417, + "balance_loss_mlp": 1.00610566, + "epoch": 0.38513797871700833, + "flos": 17671533505920.0, + "grad_norm": 5.257428259492081, + "language_loss": 0.79135489, + "learning_rate": 2.8166980543508482e-06, + "loss": 0.81322622, + "num_input_tokens_seen": 68984260, + "step": 3203, + "time_per_iteration": 2.7946207523345947 + }, + { + "auxiliary_loss_clip": 0.01150518, + "auxiliary_loss_mlp": 0.01085584, + "balance_loss_clip": 1.03657794, + "balance_loss_mlp": 1.00466466, + "epoch": 0.38525822160764744, + "flos": 25739583897600.0, + "grad_norm": 1.950708009960317, + "language_loss": 0.79753196, + "learning_rate": 2.815986926949638e-06, + "loss": 0.81989294, + "num_input_tokens_seen": 69002760, + "step": 3204, + "time_per_iteration": 2.6626811027526855 + }, + { + "auxiliary_loss_clip": 0.01137891, + "auxiliary_loss_mlp": 0.0108554, + "balance_loss_clip": 1.03357625, + "balance_loss_mlp": 1.00471592, + "epoch": 0.38537846449828655, + "flos": 20193647898240.0, + "grad_norm": 1.937620841970193, + "language_loss": 0.80253059, + "learning_rate": 2.8152756757638597e-06, + "loss": 0.82476485, + "num_input_tokens_seen": 69021260, + "step": 3205, + "time_per_iteration": 2.6796586513519287 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.01085929, + "balance_loss_clip": 1.03279257, + "balance_loss_mlp": 1.00496244, + "epoch": 0.3854987073889256, + "flos": 23039352938880.0, + "grad_norm": 1.998699236060387, + "language_loss": 0.84547871, + "learning_rate": 2.8145643009014093e-06, + "loss": 0.86765367, + "num_input_tokens_seen": 69039755, + "step": 3206, + "time_per_iteration": 2.659658670425415 + }, + { + "auxiliary_loss_clip": 0.01140668, + "auxiliary_loss_mlp": 0.01085581, + "balance_loss_clip": 1.03546989, + "balance_loss_mlp": 1.00480473, + "epoch": 0.3856189502795647, + "flos": 20190631155840.0, + "grad_norm": 1.8629079829834834, + "language_loss": 0.78975677, + "learning_rate": 2.813852802470202e-06, + "loss": 0.81201929, + "num_input_tokens_seen": 69057650, + "step": 3207, + "time_per_iteration": 2.7259180545806885 + }, + { + "auxiliary_loss_clip": 0.0113301, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_clip": 1.03592646, + "balance_loss_mlp": 1.00477064, + "epoch": 0.38573919317020383, + "flos": 25702631781120.0, + "grad_norm": 1.9153797355816304, + "language_loss": 0.71977532, + "learning_rate": 2.8131411805781717e-06, + "loss": 0.74196184, + "num_input_tokens_seen": 69077775, + "step": 3208, + "time_per_iteration": 2.726139783859253 + }, + { + "auxiliary_loss_clip": 0.01129075, + "auxiliary_loss_mlp": 0.01086404, + "balance_loss_clip": 1.03414536, + "balance_loss_mlp": 1.00534201, + "epoch": 0.3858594360608429, + "flos": 29821405628160.0, + "grad_norm": 2.6639446259432504, + "language_loss": 0.63936257, + "learning_rate": 2.8124294353332707e-06, + "loss": 0.66151738, + "num_input_tokens_seen": 69096450, + "step": 3209, + "time_per_iteration": 2.77603816986084 + }, + { + "auxiliary_loss_clip": 0.01121274, + "auxiliary_loss_mlp": 0.01085392, + "balance_loss_clip": 1.03262401, + "balance_loss_mlp": 1.00466323, + "epoch": 0.385979678951482, + "flos": 24790428961920.0, + "grad_norm": 1.6772099684791975, + "language_loss": 0.77078056, + "learning_rate": 2.8117175668434713e-06, + "loss": 0.79284728, + "num_input_tokens_seen": 69116110, + "step": 3210, + "time_per_iteration": 2.8128440380096436 + }, + { + "auxiliary_loss_clip": 0.01149268, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_clip": 1.03569841, + "balance_loss_mlp": 1.00553584, + "epoch": 0.3860999218421211, + "flos": 21287881866240.0, + "grad_norm": 2.1806400621662045, + "language_loss": 0.70869446, + "learning_rate": 2.811005575216762e-06, + "loss": 0.7310527, + "num_input_tokens_seen": 69134825, + "step": 3211, + "time_per_iteration": 2.6419286727905273 + }, + { + "auxiliary_loss_clip": 0.0111453, + "auxiliary_loss_mlp": 0.01085918, + "balance_loss_clip": 1.02822638, + "balance_loss_mlp": 1.00514221, + "epoch": 0.38622016473276016, + "flos": 24536720223360.0, + "grad_norm": 1.3972548436433772, + "language_loss": 0.79043388, + "learning_rate": 2.8102934605611513e-06, + "loss": 0.81243831, + "num_input_tokens_seen": 69156460, + "step": 3212, + "time_per_iteration": 2.866873264312744 + }, + { + "auxiliary_loss_clip": 0.01132328, + "auxiliary_loss_mlp": 0.01087377, + "balance_loss_clip": 1.0355494, + "balance_loss_mlp": 1.00650501, + "epoch": 0.3863404076233993, + "flos": 20558212986240.0, + "grad_norm": 2.086763815800691, + "language_loss": 0.67260814, + "learning_rate": 2.8095812229846665e-06, + "loss": 0.6948052, + "num_input_tokens_seen": 69176420, + "step": 3213, + "time_per_iteration": 2.886134624481201 + }, + { + "auxiliary_loss_clip": 0.01128381, + "auxiliary_loss_mlp": 0.01085044, + "balance_loss_clip": 1.03283465, + "balance_loss_mlp": 1.00436294, + "epoch": 0.3864606505140384, + "flos": 22346277039360.0, + "grad_norm": 2.3148072806920035, + "language_loss": 0.6908673, + "learning_rate": 2.808868862595355e-06, + "loss": 0.71300155, + "num_input_tokens_seen": 69196665, + "step": 3214, + "time_per_iteration": 3.6661458015441895 + }, + { + "auxiliary_loss_clip": 0.01140014, + "auxiliary_loss_mlp": 0.01086316, + "balance_loss_clip": 1.03448987, + "balance_loss_mlp": 1.0055871, + "epoch": 0.38658089340467744, + "flos": 25703601448320.0, + "grad_norm": 2.215773417884525, + "language_loss": 0.79674697, + "learning_rate": 2.8081563795012795e-06, + "loss": 0.81901026, + "num_input_tokens_seen": 69216290, + "step": 3215, + "time_per_iteration": 2.8711233139038086 + }, + { + "auxiliary_loss_clip": 0.01131565, + "auxiliary_loss_mlp": 0.01084965, + "balance_loss_clip": 1.03383684, + "balance_loss_mlp": 1.00414157, + "epoch": 0.38670113629531655, + "flos": 33802534558080.0, + "grad_norm": 1.8489802487759386, + "language_loss": 0.73698705, + "learning_rate": 2.807443773810524e-06, + "loss": 0.75915235, + "num_input_tokens_seen": 69237550, + "step": 3216, + "time_per_iteration": 2.8210322856903076 + }, + { + "auxiliary_loss_clip": 0.01108296, + "auxiliary_loss_mlp": 0.01087061, + "balance_loss_clip": 1.02856708, + "balance_loss_mlp": 1.00618935, + "epoch": 0.3868213791859556, + "flos": 23331522165120.0, + "grad_norm": 1.9651107668575427, + "language_loss": 0.89360309, + "learning_rate": 2.80673104563119e-06, + "loss": 0.91555661, + "num_input_tokens_seen": 69258175, + "step": 3217, + "time_per_iteration": 2.8331377506256104 + }, + { + "auxiliary_loss_clip": 0.01139409, + "auxiliary_loss_mlp": 0.01085349, + "balance_loss_clip": 1.03532386, + "balance_loss_mlp": 1.00457275, + "epoch": 0.3869416220765947, + "flos": 18441530380800.0, + "grad_norm": 1.7243166900284717, + "language_loss": 0.78822297, + "learning_rate": 2.8060181950713976e-06, + "loss": 0.81047058, + "num_input_tokens_seen": 69274965, + "step": 3218, + "time_per_iteration": 2.668797016143799 + }, + { + "auxiliary_loss_clip": 0.01121691, + "auxiliary_loss_mlp": 0.01085513, + "balance_loss_clip": 1.03310156, + "balance_loss_mlp": 1.00464177, + "epoch": 0.3870618649672338, + "flos": 15632992938240.0, + "grad_norm": 1.910825681987693, + "language_loss": 0.80874413, + "learning_rate": 2.805305222239286e-06, + "loss": 0.83081615, + "num_input_tokens_seen": 69292220, + "step": 3219, + "time_per_iteration": 3.6776680946350098 + }, + { + "auxiliary_loss_clip": 0.01120473, + "auxiliary_loss_mlp": 0.01085419, + "balance_loss_clip": 1.03106189, + "balance_loss_mlp": 1.00459552, + "epoch": 0.3871821078578729, + "flos": 23513804709120.0, + "grad_norm": 2.9518230518857673, + "language_loss": 0.73551011, + "learning_rate": 2.8045921272430118e-06, + "loss": 0.75756907, + "num_input_tokens_seen": 69311900, + "step": 3220, + "time_per_iteration": 2.805753707885742 + }, + { + "auxiliary_loss_clip": 0.01140948, + "auxiliary_loss_mlp": 0.01087877, + "balance_loss_clip": 1.03486943, + "balance_loss_mlp": 1.00686228, + "epoch": 0.387302350748512, + "flos": 17778259791360.0, + "grad_norm": 3.41944261771999, + "language_loss": 0.76913595, + "learning_rate": 2.803878910190753e-06, + "loss": 0.79142416, + "num_input_tokens_seen": 69328820, + "step": 3221, + "time_per_iteration": 2.679943799972534 + }, + { + "auxiliary_loss_clip": 0.01141427, + "auxiliary_loss_mlp": 0.01085983, + "balance_loss_clip": 1.03553283, + "balance_loss_mlp": 1.00511122, + "epoch": 0.3874225936391511, + "flos": 11503409097600.0, + "grad_norm": 2.29876880617122, + "language_loss": 0.82269406, + "learning_rate": 2.8031655711907017e-06, + "loss": 0.8449682, + "num_input_tokens_seen": 69342525, + "step": 3222, + "time_per_iteration": 3.665043592453003 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01086451, + "balance_loss_clip": 1.03346181, + "balance_loss_mlp": 1.00557947, + "epoch": 0.38754283652979016, + "flos": 21945154884480.0, + "grad_norm": 1.9935694117199199, + "language_loss": 0.80536044, + "learning_rate": 2.8024521103510723e-06, + "loss": 0.82760429, + "num_input_tokens_seen": 69359295, + "step": 3223, + "time_per_iteration": 2.7108230590820312 + }, + { + "auxiliary_loss_clip": 0.01139842, + "auxiliary_loss_mlp": 0.01085013, + "balance_loss_clip": 1.03389931, + "balance_loss_mlp": 1.00418913, + "epoch": 0.38766307942042927, + "flos": 21175984022400.0, + "grad_norm": 1.905215287328899, + "language_loss": 0.74880868, + "learning_rate": 2.8017385277800952e-06, + "loss": 0.77105725, + "num_input_tokens_seen": 69377650, + "step": 3224, + "time_per_iteration": 2.7154338359832764 + }, + { + "auxiliary_loss_clip": 0.01118953, + "auxiliary_loss_mlp": 0.01086691, + "balance_loss_clip": 1.03194833, + "balance_loss_mlp": 1.00577152, + "epoch": 0.3877833223110684, + "flos": 27417294391680.0, + "grad_norm": 2.149206873975183, + "language_loss": 0.75291467, + "learning_rate": 2.8010248235860213e-06, + "loss": 0.77497113, + "num_input_tokens_seen": 69397765, + "step": 3225, + "time_per_iteration": 3.851947069168091 + }, + { + "auxiliary_loss_clip": 0.01121217, + "auxiliary_loss_mlp": 0.00873411, + "balance_loss_clip": 1.03840971, + "balance_loss_mlp": 1.00171793, + "epoch": 0.38790356520170743, + "flos": 64500019879680.0, + "grad_norm": 0.8675576588846502, + "language_loss": 0.62733406, + "learning_rate": 2.8003109978771192e-06, + "loss": 0.64728034, + "num_input_tokens_seen": 69458930, + "step": 3226, + "time_per_iteration": 3.3684282302856445 + }, + { + "auxiliary_loss_clip": 0.0111807, + "auxiliary_loss_mlp": 0.01085835, + "balance_loss_clip": 1.030671, + "balance_loss_mlp": 1.00491571, + "epoch": 0.38802380809234654, + "flos": 22345415112960.0, + "grad_norm": 1.9655305416628477, + "language_loss": 0.78916657, + "learning_rate": 2.799597050761674e-06, + "loss": 0.81120563, + "num_input_tokens_seen": 69475135, + "step": 3227, + "time_per_iteration": 2.8104982376098633 + }, + { + "auxiliary_loss_clip": 0.01149357, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_clip": 1.03517759, + "balance_loss_mlp": 1.00381112, + "epoch": 0.38814405098298566, + "flos": 25261361199360.0, + "grad_norm": 1.9004983181740327, + "language_loss": 0.79091758, + "learning_rate": 2.7988829823479924e-06, + "loss": 0.81325799, + "num_input_tokens_seen": 69493525, + "step": 3228, + "time_per_iteration": 2.7010791301727295 + }, + { + "auxiliary_loss_clip": 0.0112852, + "auxiliary_loss_mlp": 0.01086017, + "balance_loss_clip": 1.03183484, + "balance_loss_mlp": 1.0050981, + "epoch": 0.3882642938736247, + "flos": 18841180078080.0, + "grad_norm": 5.5086369775727935, + "language_loss": 0.63912737, + "learning_rate": 2.7981687927443976e-06, + "loss": 0.66127276, + "num_input_tokens_seen": 69510325, + "step": 3229, + "time_per_iteration": 2.794196128845215 + }, + { + "auxiliary_loss_clip": 0.01139323, + "auxiliary_loss_mlp": 0.01086868, + "balance_loss_clip": 1.0330832, + "balance_loss_mlp": 1.00609207, + "epoch": 0.3883845367642638, + "flos": 21652806090240.0, + "grad_norm": 2.054803503319616, + "language_loss": 0.85667777, + "learning_rate": 2.797454482059231e-06, + "loss": 0.87893975, + "num_input_tokens_seen": 69530480, + "step": 3230, + "time_per_iteration": 2.7453489303588867 + }, + { + "auxiliary_loss_clip": 0.0114958, + "auxiliary_loss_mlp": 0.01085202, + "balance_loss_clip": 1.03541195, + "balance_loss_mlp": 1.00432992, + "epoch": 0.3885047796549029, + "flos": 20557530627840.0, + "grad_norm": 1.5253727472004812, + "language_loss": 0.84204912, + "learning_rate": 2.7967400504008537e-06, + "loss": 0.86439693, + "num_input_tokens_seen": 69549780, + "step": 3231, + "time_per_iteration": 2.6291067600250244 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_clip": 1.03427839, + "balance_loss_mlp": 1.00028694, + "epoch": 0.388625022545542, + "flos": 64325491695360.0, + "grad_norm": 0.8104092653306738, + "language_loss": 0.57477969, + "learning_rate": 2.7960254978776456e-06, + "loss": 0.59658289, + "num_input_tokens_seen": 69611870, + "step": 3232, + "time_per_iteration": 3.381737232208252 + }, + { + "auxiliary_loss_clip": 0.01150633, + "auxiliary_loss_mlp": 0.01085208, + "balance_loss_clip": 1.0366534, + "balance_loss_mlp": 1.00443196, + "epoch": 0.3887452654361811, + "flos": 18113881495680.0, + "grad_norm": 2.2756128702993386, + "language_loss": 0.81836379, + "learning_rate": 2.7953108245980006e-06, + "loss": 0.8407222, + "num_input_tokens_seen": 69630385, + "step": 3233, + "time_per_iteration": 2.6315152645111084 + }, + { + "auxiliary_loss_clip": 0.01120532, + "auxiliary_loss_mlp": 0.0108537, + "balance_loss_clip": 1.03078365, + "balance_loss_mlp": 1.00468946, + "epoch": 0.38886550832682015, + "flos": 24975261371520.0, + "grad_norm": 1.614151108370548, + "language_loss": 0.73622489, + "learning_rate": 2.7945960306703365e-06, + "loss": 0.75828397, + "num_input_tokens_seen": 69653370, + "step": 3234, + "time_per_iteration": 2.8244950771331787 + }, + { + "auxiliary_loss_clip": 0.01140898, + "auxiliary_loss_mlp": 0.01086093, + "balance_loss_clip": 1.03489852, + "balance_loss_mlp": 1.00526881, + "epoch": 0.38898575121745926, + "flos": 27199496275200.0, + "grad_norm": 1.679330177223985, + "language_loss": 0.65871775, + "learning_rate": 2.7938811162030865e-06, + "loss": 0.68098772, + "num_input_tokens_seen": 69673635, + "step": 3235, + "time_per_iteration": 2.830493211746216 + }, + { + "auxiliary_loss_clip": 0.01138662, + "auxiliary_loss_mlp": 0.01084899, + "balance_loss_clip": 1.03397536, + "balance_loss_mlp": 1.00421786, + "epoch": 0.3891059941080984, + "flos": 28763728727040.0, + "grad_norm": 1.590105213073962, + "language_loss": 0.82186675, + "learning_rate": 2.793166081304702e-06, + "loss": 0.84410238, + "num_input_tokens_seen": 69694130, + "step": 3236, + "time_per_iteration": 2.81543231010437 + }, + { + "auxiliary_loss_clip": 0.01120396, + "auxiliary_loss_mlp": 0.01085583, + "balance_loss_clip": 1.03215361, + "balance_loss_mlp": 1.0046643, + "epoch": 0.38922623699873743, + "flos": 22893447893760.0, + "grad_norm": 1.9839031493479626, + "language_loss": 0.82050282, + "learning_rate": 2.7924509260836543e-06, + "loss": 0.84256256, + "num_input_tokens_seen": 69713255, + "step": 3237, + "time_per_iteration": 2.811413288116455 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.01085843, + "balance_loss_clip": 1.03152001, + "balance_loss_mlp": 1.00501883, + "epoch": 0.38934647988937654, + "flos": 19792418002560.0, + "grad_norm": 1.4603357207856107, + "language_loss": 0.68150651, + "learning_rate": 2.791735650648431e-06, + "loss": 0.70355272, + "num_input_tokens_seen": 69732375, + "step": 3238, + "time_per_iteration": 2.7290971279144287 + }, + { + "auxiliary_loss_clip": 0.01126784, + "auxiliary_loss_mlp": 0.01086004, + "balance_loss_clip": 1.03161025, + "balance_loss_mlp": 1.00517964, + "epoch": 0.38946672278001565, + "flos": 19202081978880.0, + "grad_norm": 2.089027593332551, + "language_loss": 0.74338329, + "learning_rate": 2.791020255107538e-06, + "loss": 0.76551116, + "num_input_tokens_seen": 69749745, + "step": 3239, + "time_per_iteration": 3.7001404762268066 + }, + { + "auxiliary_loss_clip": 0.01121934, + "auxiliary_loss_mlp": 0.01086491, + "balance_loss_clip": 1.03385544, + "balance_loss_mlp": 1.00542903, + "epoch": 0.3895869656706547, + "flos": 24936477661440.0, + "grad_norm": 1.5703362298017274, + "language_loss": 0.80696809, + "learning_rate": 2.7903047395695023e-06, + "loss": 0.82905233, + "num_input_tokens_seen": 69769645, + "step": 3240, + "time_per_iteration": 2.79317307472229 + }, + { + "auxiliary_loss_clip": 0.01138116, + "auxiliary_loss_mlp": 0.00873407, + "balance_loss_clip": 1.03452325, + "balance_loss_mlp": 1.00038624, + "epoch": 0.3897072085612938, + "flos": 24133622820480.0, + "grad_norm": 2.1374628455081623, + "language_loss": 0.90244973, + "learning_rate": 2.789589104142865e-06, + "loss": 0.92256492, + "num_input_tokens_seen": 69787270, + "step": 3241, + "time_per_iteration": 2.7591304779052734 + }, + { + "auxiliary_loss_clip": 0.01115317, + "auxiliary_loss_mlp": 0.01086339, + "balance_loss_clip": 1.02911687, + "balance_loss_mlp": 1.0055151, + "epoch": 0.3898274514519329, + "flos": 17166342672000.0, + "grad_norm": 1.6139914848886552, + "language_loss": 0.76669836, + "learning_rate": 2.7888733489361895e-06, + "loss": 0.78871489, + "num_input_tokens_seen": 69805685, + "step": 3242, + "time_per_iteration": 2.8402316570281982 + }, + { + "auxiliary_loss_clip": 0.01138021, + "auxiliary_loss_mlp": 0.01079471, + "balance_loss_clip": 1.03969383, + "balance_loss_mlp": 1.00012577, + "epoch": 0.389947694342572, + "flos": 66074807952000.0, + "grad_norm": 0.7243554629409209, + "language_loss": 0.58713698, + "learning_rate": 2.788157474058054e-06, + "loss": 0.60931188, + "num_input_tokens_seen": 69867960, + "step": 3243, + "time_per_iteration": 3.3508713245391846 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01086127, + "balance_loss_clip": 1.03423071, + "balance_loss_mlp": 1.00525522, + "epoch": 0.3900679372332111, + "flos": 25740912700800.0, + "grad_norm": 1.986324978331319, + "language_loss": 0.69834387, + "learning_rate": 2.7874414796170555e-06, + "loss": 0.72068012, + "num_input_tokens_seen": 69889450, + "step": 3244, + "time_per_iteration": 3.606642961502075 + }, + { + "auxiliary_loss_clip": 0.01138009, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_clip": 1.03276658, + "balance_loss_mlp": 1.0056951, + "epoch": 0.3901881801238502, + "flos": 11801611808640.0, + "grad_norm": 2.4369578489823107, + "language_loss": 0.84588575, + "learning_rate": 2.7867253657218113e-06, + "loss": 0.86813289, + "num_input_tokens_seen": 69903340, + "step": 3245, + "time_per_iteration": 2.6291167736053467 + }, + { + "auxiliary_loss_clip": 0.01128173, + "auxiliary_loss_mlp": 0.00873465, + "balance_loss_clip": 1.0322684, + "balance_loss_mlp": 1.00032759, + "epoch": 0.39030842301448926, + "flos": 27308951994240.0, + "grad_norm": 1.665810965952379, + "language_loss": 0.72983754, + "learning_rate": 2.7860091324809544e-06, + "loss": 0.74985397, + "num_input_tokens_seen": 69924400, + "step": 3246, + "time_per_iteration": 2.8986740112304688 + }, + { + "auxiliary_loss_clip": 0.01136332, + "auxiliary_loss_mlp": 0.01085199, + "balance_loss_clip": 1.03344452, + "balance_loss_mlp": 1.00437498, + "epoch": 0.39042866590512837, + "flos": 27163334257920.0, + "grad_norm": 1.8609297658742843, + "language_loss": 0.81379402, + "learning_rate": 2.7852927800031377e-06, + "loss": 0.83600926, + "num_input_tokens_seen": 69944565, + "step": 3247, + "time_per_iteration": 2.803415298461914 + }, + { + "auxiliary_loss_clip": 0.01128397, + "auxiliary_loss_mlp": 0.01086391, + "balance_loss_clip": 1.03246748, + "balance_loss_mlp": 1.00547194, + "epoch": 0.3905489087957674, + "flos": 29716115886720.0, + "grad_norm": 2.417377642820086, + "language_loss": 0.82812083, + "learning_rate": 2.7845763083970298e-06, + "loss": 0.85026866, + "num_input_tokens_seen": 69964965, + "step": 3248, + "time_per_iteration": 3.8018674850463867 + }, + { + "auxiliary_loss_clip": 0.011394, + "auxiliary_loss_mlp": 0.01086212, + "balance_loss_clip": 1.03379941, + "balance_loss_mlp": 1.00533998, + "epoch": 0.39066915168640653, + "flos": 24498618871680.0, + "grad_norm": 1.8999230367252102, + "language_loss": 0.81893873, + "learning_rate": 2.7838597177713205e-06, + "loss": 0.84119487, + "num_input_tokens_seen": 69986055, + "step": 3249, + "time_per_iteration": 2.78902530670166 + }, + { + "auxiliary_loss_clip": 0.01087772, + "auxiliary_loss_mlp": 0.01087149, + "balance_loss_clip": 1.02611208, + "balance_loss_mlp": 1.00627708, + "epoch": 0.39078939457704565, + "flos": 20558572122240.0, + "grad_norm": 1.7693075572992962, + "language_loss": 0.73227, + "learning_rate": 2.7831430082347143e-06, + "loss": 0.75401914, + "num_input_tokens_seen": 70005260, + "step": 3250, + "time_per_iteration": 3.8802616596221924 + }, + { + "auxiliary_loss_clip": 0.01141426, + "auxiliary_loss_mlp": 0.00873316, + "balance_loss_clip": 1.03593802, + "balance_loss_mlp": 1.00037789, + "epoch": 0.3909096374676847, + "flos": 22783417557120.0, + "grad_norm": 1.9689793329179903, + "language_loss": 0.8232702, + "learning_rate": 2.7824261798959373e-06, + "loss": 0.84341764, + "num_input_tokens_seen": 70023440, + "step": 3251, + "time_per_iteration": 2.7016193866729736 + }, + { + "auxiliary_loss_clip": 0.01130746, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_clip": 1.0331192, + "balance_loss_mlp": 1.00476861, + "epoch": 0.3910298803583238, + "flos": 23003119094400.0, + "grad_norm": 1.8584588959798252, + "language_loss": 0.7989893, + "learning_rate": 2.78170923286373e-06, + "loss": 0.82115221, + "num_input_tokens_seen": 70043040, + "step": 3252, + "time_per_iteration": 2.8147964477539062 + }, + { + "auxiliary_loss_clip": 0.01079369, + "auxiliary_loss_mlp": 0.01087018, + "balance_loss_clip": 1.02588546, + "balance_loss_mlp": 1.00614607, + "epoch": 0.3911501232489629, + "flos": 24316264500480.0, + "grad_norm": 2.2135005540919983, + "language_loss": 0.83679831, + "learning_rate": 2.780992167246854e-06, + "loss": 0.85846221, + "num_input_tokens_seen": 70060565, + "step": 3253, + "time_per_iteration": 2.9417364597320557 + }, + { + "auxiliary_loss_clip": 0.0112134, + "auxiliary_loss_mlp": 0.01079648, + "balance_loss_clip": 1.03909469, + "balance_loss_mlp": 1.0003022, + "epoch": 0.391270366139602, + "flos": 60869054684160.0, + "grad_norm": 0.9726105209950275, + "language_loss": 0.72149646, + "learning_rate": 2.7802749831540883e-06, + "loss": 0.74350637, + "num_input_tokens_seen": 70119465, + "step": 3254, + "time_per_iteration": 3.3226141929626465 + }, + { + "auxiliary_loss_clip": 0.01094958, + "auxiliary_loss_mlp": 0.01084928, + "balance_loss_clip": 1.03124094, + "balance_loss_mlp": 1.00439024, + "epoch": 0.3913906090302411, + "flos": 21543494025600.0, + "grad_norm": 2.0111266376545993, + "language_loss": 0.81863683, + "learning_rate": 2.7795576806942268e-06, + "loss": 0.84043574, + "num_input_tokens_seen": 70138270, + "step": 3255, + "time_per_iteration": 2.876962423324585 + }, + { + "auxiliary_loss_clip": 0.01109355, + "auxiliary_loss_mlp": 0.01079281, + "balance_loss_clip": 1.0285852, + "balance_loss_mlp": 0.99993497, + "epoch": 0.3915108519208802, + "flos": 49839953702400.0, + "grad_norm": 0.7703339792067305, + "language_loss": 0.54959702, + "learning_rate": 2.778840259976085e-06, + "loss": 0.57148337, + "num_input_tokens_seen": 70193500, + "step": 3256, + "time_per_iteration": 3.261322259902954 + }, + { + "auxiliary_loss_clip": 0.01137706, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.03377557, + "balance_loss_mlp": 1.0045166, + "epoch": 0.39163109481151925, + "flos": 16506447960960.0, + "grad_norm": 2.099846587531281, + "language_loss": 0.77068973, + "learning_rate": 2.778122721108495e-06, + "loss": 0.79292071, + "num_input_tokens_seen": 70211730, + "step": 3257, + "time_per_iteration": 2.76309871673584 + }, + { + "auxiliary_loss_clip": 0.01140519, + "auxiliary_loss_mlp": 0.010844, + "balance_loss_clip": 1.03549552, + "balance_loss_mlp": 1.00367188, + "epoch": 0.39175133770215836, + "flos": 26067484177920.0, + "grad_norm": 2.7735272002569573, + "language_loss": 0.88244694, + "learning_rate": 2.7774050642003076e-06, + "loss": 0.90469611, + "num_input_tokens_seen": 70232540, + "step": 3258, + "time_per_iteration": 2.7936363220214844 + }, + { + "auxiliary_loss_clip": 0.01148177, + "auxiliary_loss_mlp": 0.01086484, + "balance_loss_clip": 1.03437591, + "balance_loss_mlp": 1.00551724, + "epoch": 0.3918715805927975, + "flos": 21872076664320.0, + "grad_norm": 2.303537748076425, + "language_loss": 0.93667638, + "learning_rate": 2.7766872893603896e-06, + "loss": 0.959023, + "num_input_tokens_seen": 70252515, + "step": 3259, + "time_per_iteration": 2.6913294792175293 + }, + { + "auxiliary_loss_clip": 0.01141041, + "auxiliary_loss_mlp": 0.01085748, + "balance_loss_clip": 1.03550386, + "balance_loss_mlp": 1.00506735, + "epoch": 0.39199182348343653, + "flos": 20376181837440.0, + "grad_norm": 1.640542696002812, + "language_loss": 0.73181891, + "learning_rate": 2.7759693966976275e-06, + "loss": 0.75408679, + "num_input_tokens_seen": 70271020, + "step": 3260, + "time_per_iteration": 2.690396308898926 + }, + { + "auxiliary_loss_clip": 0.01109849, + "auxiliary_loss_mlp": 0.01087271, + "balance_loss_clip": 1.02852356, + "balance_loss_mlp": 1.00625634, + "epoch": 0.39211206637407564, + "flos": 21683545153920.0, + "grad_norm": 2.0780968662081687, + "language_loss": 0.84937644, + "learning_rate": 2.7752513863209242e-06, + "loss": 0.87134767, + "num_input_tokens_seen": 70289600, + "step": 3261, + "time_per_iteration": 2.803415536880493 + }, + { + "auxiliary_loss_clip": 0.01126168, + "auxiliary_loss_mlp": 0.00873236, + "balance_loss_clip": 1.03257871, + "balance_loss_mlp": 1.00027895, + "epoch": 0.39223230926471475, + "flos": 21066276908160.0, + "grad_norm": 1.709460343629269, + "language_loss": 0.83855224, + "learning_rate": 2.774533258339203e-06, + "loss": 0.85854626, + "num_input_tokens_seen": 70307060, + "step": 3262, + "time_per_iteration": 2.760822296142578 + }, + { + "auxiliary_loss_clip": 0.010975, + "auxiliary_loss_mlp": 0.01087353, + "balance_loss_clip": 1.03115809, + "balance_loss_mlp": 1.00624323, + "epoch": 0.3923525521553538, + "flos": 17603016312960.0, + "grad_norm": 2.109367809055764, + "language_loss": 0.79379433, + "learning_rate": 2.7738150128614014e-06, + "loss": 0.81564289, + "num_input_tokens_seen": 70324465, + "step": 3263, + "time_per_iteration": 2.7525203227996826 + }, + { + "auxiliary_loss_clip": 0.01116785, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_clip": 1.03021681, + "balance_loss_mlp": 1.00442016, + "epoch": 0.3924727950459929, + "flos": 20558284813440.0, + "grad_norm": 1.8080163727305962, + "language_loss": 0.89852738, + "learning_rate": 2.7730966499964777e-06, + "loss": 0.9205476, + "num_input_tokens_seen": 70341415, + "step": 3264, + "time_per_iteration": 2.8064308166503906 + }, + { + "auxiliary_loss_clip": 0.01146893, + "auxiliary_loss_mlp": 0.01086201, + "balance_loss_clip": 1.03315103, + "balance_loss_mlp": 1.00537729, + "epoch": 0.39259303793663197, + "flos": 16216110328320.0, + "grad_norm": 2.4420243068747833, + "language_loss": 0.80965632, + "learning_rate": 2.772378169853408e-06, + "loss": 0.83198726, + "num_input_tokens_seen": 70358985, + "step": 3265, + "time_per_iteration": 3.5118727684020996 + }, + { + "auxiliary_loss_clip": 0.01117491, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_clip": 1.03147447, + "balance_loss_mlp": 1.00558877, + "epoch": 0.3927132808272711, + "flos": 16797001075200.0, + "grad_norm": 1.9965086089926243, + "language_loss": 0.74087203, + "learning_rate": 2.771659572541183e-06, + "loss": 0.76291013, + "num_input_tokens_seen": 70376915, + "step": 3266, + "time_per_iteration": 2.8091254234313965 + }, + { + "auxiliary_loss_clip": 0.01139546, + "auxiliary_loss_mlp": 0.01086694, + "balance_loss_clip": 1.03454709, + "balance_loss_mlp": 1.00591779, + "epoch": 0.3928335237179102, + "flos": 20267228908800.0, + "grad_norm": 2.617323048803224, + "language_loss": 0.86949265, + "learning_rate": 2.7709408581688143e-06, + "loss": 0.89175504, + "num_input_tokens_seen": 70396900, + "step": 3267, + "time_per_iteration": 2.691227912902832 + }, + { + "auxiliary_loss_clip": 0.01106039, + "auxiliary_loss_mlp": 0.01087541, + "balance_loss_clip": 1.03334355, + "balance_loss_mlp": 1.00676465, + "epoch": 0.39295376660854925, + "flos": 24973250209920.0, + "grad_norm": 1.5584110524592372, + "language_loss": 0.87684232, + "learning_rate": 2.7702220268453307e-06, + "loss": 0.89877808, + "num_input_tokens_seen": 70417260, + "step": 3268, + "time_per_iteration": 2.789405584335327 + }, + { + "auxiliary_loss_clip": 0.0112642, + "auxiliary_loss_mlp": 0.01084526, + "balance_loss_clip": 1.03076744, + "balance_loss_mlp": 1.00365496, + "epoch": 0.39307400949918836, + "flos": 18697788984960.0, + "grad_norm": 2.09854254598612, + "language_loss": 0.85070968, + "learning_rate": 2.7695030786797785e-06, + "loss": 0.87281919, + "num_input_tokens_seen": 70433155, + "step": 3269, + "time_per_iteration": 3.6552655696868896 + }, + { + "auxiliary_loss_clip": 0.01109638, + "auxiliary_loss_mlp": 0.01086628, + "balance_loss_clip": 1.03104138, + "balance_loss_mlp": 1.00575662, + "epoch": 0.39319425238982747, + "flos": 22415476590720.0, + "grad_norm": 1.9777606047209892, + "language_loss": 0.75085706, + "learning_rate": 2.7687840137812206e-06, + "loss": 0.77281976, + "num_input_tokens_seen": 70451240, + "step": 3270, + "time_per_iteration": 2.8442134857177734 + }, + { + "auxiliary_loss_clip": 0.01124198, + "auxiliary_loss_mlp": 0.01079367, + "balance_loss_clip": 1.03528357, + "balance_loss_mlp": 1.00002098, + "epoch": 0.3933144952804665, + "flos": 66192954762240.0, + "grad_norm": 0.7937020936928997, + "language_loss": 0.62098271, + "learning_rate": 2.7680648322587395e-06, + "loss": 0.64301831, + "num_input_tokens_seen": 70516115, + "step": 3271, + "time_per_iteration": 3.3013694286346436 + }, + { + "auxiliary_loss_clip": 0.01146552, + "auxiliary_loss_mlp": 0.01085308, + "balance_loss_clip": 1.03385735, + "balance_loss_mlp": 1.0045321, + "epoch": 0.39343473817110564, + "flos": 15487159720320.0, + "grad_norm": 1.7857384984174511, + "language_loss": 0.80937219, + "learning_rate": 2.7673455342214334e-06, + "loss": 0.83169079, + "num_input_tokens_seen": 70533105, + "step": 3272, + "time_per_iteration": 2.719925880432129 + }, + { + "auxiliary_loss_clip": 0.01137704, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_clip": 1.03337097, + "balance_loss_mlp": 1.00580859, + "epoch": 0.39355498106174475, + "flos": 21324905809920.0, + "grad_norm": 2.2886624282442756, + "language_loss": 0.75925368, + "learning_rate": 2.7666261197784198e-06, + "loss": 0.78149605, + "num_input_tokens_seen": 70551920, + "step": 3273, + "time_per_iteration": 4.39775276184082 + }, + { + "auxiliary_loss_clip": 0.01119424, + "auxiliary_loss_mlp": 0.01086429, + "balance_loss_clip": 1.03033984, + "balance_loss_mlp": 1.0056529, + "epoch": 0.3936752239523838, + "flos": 13296357400320.0, + "grad_norm": 1.9202563074207677, + "language_loss": 0.76052719, + "learning_rate": 2.7659065890388336e-06, + "loss": 0.78258574, + "num_input_tokens_seen": 70567920, + "step": 3274, + "time_per_iteration": 2.8748581409454346 + }, + { + "auxiliary_loss_clip": 0.01127227, + "auxiliary_loss_mlp": 0.01086489, + "balance_loss_clip": 1.03136349, + "balance_loss_mlp": 1.00561714, + "epoch": 0.3937954668430229, + "flos": 16800161472000.0, + "grad_norm": 2.129142213248035, + "language_loss": 0.8460691, + "learning_rate": 2.7651869421118266e-06, + "loss": 0.86820626, + "num_input_tokens_seen": 70584530, + "step": 3275, + "time_per_iteration": 2.8326244354248047 + }, + { + "auxiliary_loss_clip": 0.01124852, + "auxiliary_loss_mlp": 0.01087471, + "balance_loss_clip": 1.03568053, + "balance_loss_mlp": 1.00664675, + "epoch": 0.393915709733662, + "flos": 21064229832960.0, + "grad_norm": 1.6908427239909807, + "language_loss": 0.82855237, + "learning_rate": 2.76446717910657e-06, + "loss": 0.85067558, + "num_input_tokens_seen": 70605235, + "step": 3276, + "time_per_iteration": 3.780428409576416 + }, + { + "auxiliary_loss_clip": 0.01134569, + "auxiliary_loss_mlp": 0.01087045, + "balance_loss_clip": 1.03137004, + "balance_loss_mlp": 1.00617325, + "epoch": 0.3940359526243011, + "flos": 17165265264000.0, + "grad_norm": 2.47356977506026, + "language_loss": 0.77335286, + "learning_rate": 2.763747300132249e-06, + "loss": 0.79556906, + "num_input_tokens_seen": 70622675, + "step": 3277, + "time_per_iteration": 2.7967724800109863 + }, + { + "auxiliary_loss_clip": 0.01146298, + "auxiliary_loss_mlp": 0.01086309, + "balance_loss_clip": 1.03299856, + "balance_loss_mlp": 1.00558019, + "epoch": 0.3941561955149402, + "flos": 20995856294400.0, + "grad_norm": 1.565894089107575, + "language_loss": 0.86507607, + "learning_rate": 2.7630273052980704e-06, + "loss": 0.88740218, + "num_input_tokens_seen": 70643265, + "step": 3278, + "time_per_iteration": 2.7855987548828125 + }, + { + "auxiliary_loss_clip": 0.01127077, + "auxiliary_loss_mlp": 0.01084985, + "balance_loss_clip": 1.03110552, + "balance_loss_mlp": 1.00430453, + "epoch": 0.39427643840557924, + "flos": 18843406721280.0, + "grad_norm": 1.895204632078573, + "language_loss": 0.67361963, + "learning_rate": 2.7623071947132554e-06, + "loss": 0.69574022, + "num_input_tokens_seen": 70660295, + "step": 3279, + "time_per_iteration": 2.811218500137329 + }, + { + "auxiliary_loss_clip": 0.01129138, + "auxiliary_loss_mlp": 0.01085725, + "balance_loss_clip": 1.03214741, + "balance_loss_mlp": 1.00490117, + "epoch": 0.39439668129621835, + "flos": 23258659426560.0, + "grad_norm": 2.410444513162283, + "language_loss": 0.78470576, + "learning_rate": 2.7615869684870458e-06, + "loss": 0.80685437, + "num_input_tokens_seen": 70679605, + "step": 3280, + "time_per_iteration": 2.8354451656341553 + }, + { + "auxiliary_loss_clip": 0.01137648, + "auxiliary_loss_mlp": 0.01086654, + "balance_loss_clip": 1.03385925, + "balance_loss_mlp": 1.00587773, + "epoch": 0.39451692418685746, + "flos": 26652289507200.0, + "grad_norm": 1.5771565259069005, + "language_loss": 0.84735602, + "learning_rate": 2.7608666267286986e-06, + "loss": 0.8695991, + "num_input_tokens_seen": 70699835, + "step": 3281, + "time_per_iteration": 2.739116907119751 + }, + { + "auxiliary_loss_clip": 0.01097129, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_clip": 1.02781427, + "balance_loss_mlp": 1.00511682, + "epoch": 0.3946371670774965, + "flos": 18258709132800.0, + "grad_norm": 2.0331245291218933, + "language_loss": 0.86480302, + "learning_rate": 2.760146169547489e-06, + "loss": 0.8866356, + "num_input_tokens_seen": 70716600, + "step": 3282, + "time_per_iteration": 2.861483097076416 + }, + { + "auxiliary_loss_clip": 0.01129003, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_clip": 1.03400123, + "balance_loss_mlp": 1.00590801, + "epoch": 0.39475740996813563, + "flos": 24206126423040.0, + "grad_norm": 2.224770428555841, + "language_loss": 0.7665509, + "learning_rate": 2.75942559705271e-06, + "loss": 0.78870827, + "num_input_tokens_seen": 70736335, + "step": 3283, + "time_per_iteration": 2.753282308578491 + }, + { + "auxiliary_loss_clip": 0.01129315, + "auxiliary_loss_mlp": 0.01086023, + "balance_loss_clip": 1.0308497, + "balance_loss_mlp": 1.00519896, + "epoch": 0.39487765285877474, + "flos": 19317858491520.0, + "grad_norm": 2.1425846043432992, + "language_loss": 0.89091861, + "learning_rate": 2.7587049093536713e-06, + "loss": 0.91307199, + "num_input_tokens_seen": 70752665, + "step": 3284, + "time_per_iteration": 2.707719564437866 + }, + { + "auxiliary_loss_clip": 0.01122451, + "auxiliary_loss_mlp": 0.01084812, + "balance_loss_clip": 1.03257656, + "balance_loss_mlp": 1.00408316, + "epoch": 0.3949978957494138, + "flos": 17311744926720.0, + "grad_norm": 1.869537141064487, + "language_loss": 0.8066175, + "learning_rate": 2.757984106559701e-06, + "loss": 0.82869017, + "num_input_tokens_seen": 70771650, + "step": 3285, + "time_per_iteration": 2.710103988647461 + }, + { + "auxiliary_loss_clip": 0.0112972, + "auxiliary_loss_mlp": 0.01086987, + "balance_loss_clip": 1.03403616, + "balance_loss_mlp": 1.00621128, + "epoch": 0.3951181386400529, + "flos": 36317861280000.0, + "grad_norm": 2.785094506530883, + "language_loss": 0.71220541, + "learning_rate": 2.7572631887801446e-06, + "loss": 0.73437256, + "num_input_tokens_seen": 70793275, + "step": 3286, + "time_per_iteration": 2.923990488052368 + }, + { + "auxiliary_loss_clip": 0.0113519, + "auxiliary_loss_mlp": 0.010848, + "balance_loss_clip": 1.03191304, + "balance_loss_mlp": 1.00392854, + "epoch": 0.395238381530692, + "flos": 23110348170240.0, + "grad_norm": 1.6275026003102129, + "language_loss": 0.76456785, + "learning_rate": 2.7565421561243654e-06, + "loss": 0.78676772, + "num_input_tokens_seen": 70811440, + "step": 3287, + "time_per_iteration": 2.7324819564819336 + }, + { + "auxiliary_loss_clip": 0.0111834, + "auxiliary_loss_mlp": 0.01086424, + "balance_loss_clip": 1.03123295, + "balance_loss_mlp": 1.0056951, + "epoch": 0.3953586244213311, + "flos": 24347614095360.0, + "grad_norm": 1.9083614333661716, + "language_loss": 0.82235587, + "learning_rate": 2.7558210087017413e-06, + "loss": 0.84440351, + "num_input_tokens_seen": 70831375, + "step": 3288, + "time_per_iteration": 2.8335886001586914 + }, + { + "auxiliary_loss_clip": 0.01108718, + "auxiliary_loss_mlp": 0.01085073, + "balance_loss_clip": 1.02564454, + "balance_loss_mlp": 1.00424886, + "epoch": 0.3954788673119702, + "flos": 23440080044160.0, + "grad_norm": 2.028244369929101, + "language_loss": 0.73494947, + "learning_rate": 2.7550997466216724e-06, + "loss": 0.75688744, + "num_input_tokens_seen": 70849170, + "step": 3289, + "time_per_iteration": 2.798814296722412 + }, + { + "auxiliary_loss_clip": 0.01121837, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_clip": 1.02906597, + "balance_loss_mlp": 1.00481594, + "epoch": 0.3955991102026093, + "flos": 17494063384320.0, + "grad_norm": 1.767644875856087, + "language_loss": 0.81640267, + "learning_rate": 2.7543783699935714e-06, + "loss": 0.83847654, + "num_input_tokens_seen": 70867200, + "step": 3290, + "time_per_iteration": 2.7329602241516113 + }, + { + "auxiliary_loss_clip": 0.01136395, + "auxiliary_loss_mlp": 0.01085942, + "balance_loss_clip": 1.03396749, + "balance_loss_mlp": 1.00511801, + "epoch": 0.39571935309324835, + "flos": 18221326053120.0, + "grad_norm": 2.4357129447247603, + "language_loss": 0.86239624, + "learning_rate": 2.753656878926872e-06, + "loss": 0.88461965, + "num_input_tokens_seen": 70883080, + "step": 3291, + "time_per_iteration": 3.6572179794311523 + }, + { + "auxiliary_loss_clip": 0.01125442, + "auxiliary_loss_mlp": 0.01085489, + "balance_loss_clip": 1.03034949, + "balance_loss_mlp": 1.00471282, + "epoch": 0.39583959598388746, + "flos": 17748813617280.0, + "grad_norm": 1.735409511541366, + "language_loss": 0.74155486, + "learning_rate": 2.752935273531023e-06, + "loss": 0.76366413, + "num_input_tokens_seen": 70901230, + "step": 3292, + "time_per_iteration": 2.7012081146240234 + }, + { + "auxiliary_loss_clip": 0.01136529, + "auxiliary_loss_mlp": 0.01085539, + "balance_loss_clip": 1.03278542, + "balance_loss_mlp": 1.00461984, + "epoch": 0.39595983887452657, + "flos": 19352368483200.0, + "grad_norm": 2.151088441871211, + "language_loss": 0.78713787, + "learning_rate": 2.752213553915492e-06, + "loss": 0.80935848, + "num_input_tokens_seen": 70919585, + "step": 3293, + "time_per_iteration": 2.71598219871521 + }, + { + "auxiliary_loss_clip": 0.01117608, + "auxiliary_loss_mlp": 0.01079869, + "balance_loss_clip": 1.0364238, + "balance_loss_mlp": 1.00052357, + "epoch": 0.3960800817651656, + "flos": 60682282940160.0, + "grad_norm": 0.8155449132510406, + "language_loss": 0.66102028, + "learning_rate": 2.751491720189762e-06, + "loss": 0.68299508, + "num_input_tokens_seen": 70977695, + "step": 3294, + "time_per_iteration": 3.2455532550811768 + }, + { + "auxiliary_loss_clip": 0.01127292, + "auxiliary_loss_mlp": 0.00873321, + "balance_loss_clip": 1.03235221, + "balance_loss_mlp": 1.00036573, + "epoch": 0.39620032465580474, + "flos": 16836718538880.0, + "grad_norm": 5.554271115384624, + "language_loss": 0.91500723, + "learning_rate": 2.7507697724633364e-06, + "loss": 0.93501341, + "num_input_tokens_seen": 70994455, + "step": 3295, + "time_per_iteration": 3.6467294692993164 + }, + { + "auxiliary_loss_clip": 0.01098568, + "auxiliary_loss_mlp": 0.01079658, + "balance_loss_clip": 1.02621412, + "balance_loss_mlp": 1.00031233, + "epoch": 0.3963205675464438, + "flos": 69071445941760.0, + "grad_norm": 0.7754410947209359, + "language_loss": 0.5465681, + "learning_rate": 2.7500477108457327e-06, + "loss": 0.56835037, + "num_input_tokens_seen": 71046465, + "step": 3296, + "time_per_iteration": 3.1364307403564453 + }, + { + "auxiliary_loss_clip": 0.01136551, + "auxiliary_loss_mlp": 0.01085462, + "balance_loss_clip": 1.03263092, + "balance_loss_mlp": 1.00473356, + "epoch": 0.3964408104370829, + "flos": 25667439431040.0, + "grad_norm": 2.189980773690078, + "language_loss": 0.80871797, + "learning_rate": 2.7493255354464877e-06, + "loss": 0.8309381, + "num_input_tokens_seen": 71064275, + "step": 3297, + "time_per_iteration": 2.724168062210083 + }, + { + "auxiliary_loss_clip": 0.0106899, + "auxiliary_loss_mlp": 0.01086576, + "balance_loss_clip": 1.02613425, + "balance_loss_mlp": 1.00579989, + "epoch": 0.396561053327722, + "flos": 24277480790400.0, + "grad_norm": 1.7829871809623112, + "language_loss": 0.7604996, + "learning_rate": 2.748603246375156e-06, + "loss": 0.78205526, + "num_input_tokens_seen": 71082290, + "step": 3298, + "time_per_iteration": 3.0088350772857666 + }, + { + "auxiliary_loss_clip": 0.01147323, + "auxiliary_loss_mlp": 0.01087464, + "balance_loss_clip": 1.03427839, + "balance_loss_mlp": 1.00673568, + "epoch": 0.39668129621836107, + "flos": 20522302364160.0, + "grad_norm": 2.787370759605996, + "language_loss": 0.69493484, + "learning_rate": 2.7478808437413055e-06, + "loss": 0.71728271, + "num_input_tokens_seen": 71101700, + "step": 3299, + "time_per_iteration": 3.870774507522583 + }, + { + "auxiliary_loss_clip": 0.01099385, + "auxiliary_loss_mlp": 0.01085712, + "balance_loss_clip": 1.02415824, + "balance_loss_mlp": 1.0049839, + "epoch": 0.3968015391090002, + "flos": 27052585649280.0, + "grad_norm": 1.715717495901068, + "language_loss": 0.65867555, + "learning_rate": 2.7471583276545263e-06, + "loss": 0.68052661, + "num_input_tokens_seen": 71122360, + "step": 3300, + "time_per_iteration": 2.8516860008239746 + }, + { + "auxiliary_loss_clip": 0.01125027, + "auxiliary_loss_mlp": 0.01085262, + "balance_loss_clip": 1.02955556, + "balance_loss_mlp": 1.00439048, + "epoch": 0.3969217819996393, + "flos": 12531819392640.0, + "grad_norm": 2.1066447884425092, + "language_loss": 0.70381832, + "learning_rate": 2.7464356982244224e-06, + "loss": 0.72592127, + "num_input_tokens_seen": 71140360, + "step": 3301, + "time_per_iteration": 3.738022804260254 + }, + { + "auxiliary_loss_clip": 0.01118804, + "auxiliary_loss_mlp": 0.01079695, + "balance_loss_clip": 1.02987087, + "balance_loss_mlp": 1.00034893, + "epoch": 0.39704202489027834, + "flos": 66241399230720.0, + "grad_norm": 0.773410233184143, + "language_loss": 0.61712599, + "learning_rate": 2.745712955560617e-06, + "loss": 0.63911092, + "num_input_tokens_seen": 71196565, + "step": 3302, + "time_per_iteration": 3.3823165893554688 + }, + { + "auxiliary_loss_clip": 0.01099752, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_clip": 1.02965379, + "balance_loss_mlp": 1.00573087, + "epoch": 0.39716226778091746, + "flos": 16982982720000.0, + "grad_norm": 2.2970520585685854, + "language_loss": 0.76716936, + "learning_rate": 2.7449900997727496e-06, + "loss": 0.78903234, + "num_input_tokens_seen": 71214675, + "step": 3303, + "time_per_iteration": 2.899805784225464 + }, + { + "auxiliary_loss_clip": 0.01123155, + "auxiliary_loss_mlp": 0.01086898, + "balance_loss_clip": 1.02962601, + "balance_loss_mlp": 1.00602639, + "epoch": 0.39728251067155657, + "flos": 23477139901440.0, + "grad_norm": 2.3708334723648705, + "language_loss": 0.84067816, + "learning_rate": 2.744267130970476e-06, + "loss": 0.86277866, + "num_input_tokens_seen": 71234400, + "step": 3304, + "time_per_iteration": 2.7839653491973877 + }, + { + "auxiliary_loss_clip": 0.0112767, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.03254128, + "balance_loss_mlp": 1.00595367, + "epoch": 0.3974027535621956, + "flos": 20704441253760.0, + "grad_norm": 1.742679675508003, + "language_loss": 0.77169871, + "learning_rate": 2.7435440492634697e-06, + "loss": 0.7938441, + "num_input_tokens_seen": 71253725, + "step": 3305, + "time_per_iteration": 2.778219223022461 + }, + { + "auxiliary_loss_clip": 0.01118105, + "auxiliary_loss_mlp": 0.0108629, + "balance_loss_clip": 1.02883255, + "balance_loss_mlp": 1.00513232, + "epoch": 0.39752299645283473, + "flos": 21543278544000.0, + "grad_norm": 2.006005164043551, + "language_loss": 0.66906899, + "learning_rate": 2.7428208547614228e-06, + "loss": 0.691113, + "num_input_tokens_seen": 71273220, + "step": 3306, + "time_per_iteration": 2.7423367500305176 + }, + { + "auxiliary_loss_clip": 0.01136511, + "auxiliary_loss_mlp": 0.01086012, + "balance_loss_clip": 1.0328269, + "balance_loss_mlp": 1.00514054, + "epoch": 0.39764323934347384, + "flos": 19208295031680.0, + "grad_norm": 3.0629578682938607, + "language_loss": 0.77097201, + "learning_rate": 2.742097547574043e-06, + "loss": 0.79319721, + "num_input_tokens_seen": 71291445, + "step": 3307, + "time_per_iteration": 2.703503370285034 + }, + { + "auxiliary_loss_clip": 0.01129424, + "auxiliary_loss_mlp": 0.0087342, + "balance_loss_clip": 1.03310418, + "balance_loss_mlp": 1.00035262, + "epoch": 0.3977634822341129, + "flos": 20850202644480.0, + "grad_norm": 2.072119559729427, + "language_loss": 0.7806766, + "learning_rate": 2.7413741278110544e-06, + "loss": 0.80070508, + "num_input_tokens_seen": 71310135, + "step": 3308, + "time_per_iteration": 2.795400619506836 + }, + { + "auxiliary_loss_clip": 0.01127454, + "auxiliary_loss_mlp": 0.01084637, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.00386047, + "epoch": 0.397883725124752, + "flos": 39786042038400.0, + "grad_norm": 2.1910914724247585, + "language_loss": 0.68759006, + "learning_rate": 2.7406505955822016e-06, + "loss": 0.70971096, + "num_input_tokens_seen": 71331160, + "step": 3309, + "time_per_iteration": 2.85427188873291 + }, + { + "auxiliary_loss_clip": 0.01131152, + "auxiliary_loss_mlp": 0.01086353, + "balance_loss_clip": 1.0337981, + "balance_loss_mlp": 1.00543404, + "epoch": 0.39800396801539106, + "flos": 17379507934080.0, + "grad_norm": 3.2636801824912194, + "language_loss": 0.66166186, + "learning_rate": 2.7399269509972415e-06, + "loss": 0.68383694, + "num_input_tokens_seen": 71345315, + "step": 3310, + "time_per_iteration": 2.7243406772613525 + }, + { + "auxiliary_loss_clip": 0.01129595, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_clip": 1.03171778, + "balance_loss_mlp": 1.00555265, + "epoch": 0.3981242109060302, + "flos": 19202764337280.0, + "grad_norm": 2.118799519727646, + "language_loss": 0.85210156, + "learning_rate": 2.7392031941659514e-06, + "loss": 0.87426454, + "num_input_tokens_seen": 71363160, + "step": 3311, + "time_per_iteration": 2.7897040843963623 + }, + { + "auxiliary_loss_clip": 0.01127338, + "auxiliary_loss_mlp": 0.01086277, + "balance_loss_clip": 1.03320432, + "balance_loss_mlp": 1.00545323, + "epoch": 0.3982444537966693, + "flos": 24565124903040.0, + "grad_norm": 1.8041311560756201, + "language_loss": 0.85327882, + "learning_rate": 2.7384793251981244e-06, + "loss": 0.87541497, + "num_input_tokens_seen": 71382145, + "step": 3312, + "time_per_iteration": 2.7690587043762207 + }, + { + "auxiliary_loss_clip": 0.01139232, + "auxiliary_loss_mlp": 0.01086236, + "balance_loss_clip": 1.03368974, + "balance_loss_mlp": 1.00531697, + "epoch": 0.39836469668730834, + "flos": 26213856099840.0, + "grad_norm": 2.0153789831003177, + "language_loss": 0.80471075, + "learning_rate": 2.737755344203571e-06, + "loss": 0.82696539, + "num_input_tokens_seen": 71402095, + "step": 3313, + "time_per_iteration": 2.979999542236328 + }, + { + "auxiliary_loss_clip": 0.01137731, + "auxiliary_loss_mlp": 0.0108609, + "balance_loss_clip": 1.03312826, + "balance_loss_mlp": 1.00531352, + "epoch": 0.39848493957794745, + "flos": 27636134002560.0, + "grad_norm": 1.6226991915273044, + "language_loss": 0.79821378, + "learning_rate": 2.7370312512921186e-06, + "loss": 0.82045197, + "num_input_tokens_seen": 71423875, + "step": 3314, + "time_per_iteration": 2.7566182613372803 + }, + { + "auxiliary_loss_clip": 0.01128847, + "auxiliary_loss_mlp": 0.01086757, + "balance_loss_clip": 1.03246903, + "balance_loss_mlp": 1.00579, + "epoch": 0.39860518246858656, + "flos": 12239326944000.0, + "grad_norm": 2.6013178462894815, + "language_loss": 0.76624829, + "learning_rate": 2.736307046573611e-06, + "loss": 0.78840435, + "num_input_tokens_seen": 71439745, + "step": 3315, + "time_per_iteration": 2.7497572898864746 + }, + { + "auxiliary_loss_clip": 0.01148344, + "auxiliary_loss_mlp": 0.01085814, + "balance_loss_clip": 1.03510618, + "balance_loss_mlp": 1.00508523, + "epoch": 0.3987254253592256, + "flos": 22379135005440.0, + "grad_norm": 1.4924865352610015, + "language_loss": 0.81458622, + "learning_rate": 2.73558273015791e-06, + "loss": 0.83692783, + "num_input_tokens_seen": 71459575, + "step": 3316, + "time_per_iteration": 3.540384292602539 + }, + { + "auxiliary_loss_clip": 0.01148987, + "auxiliary_loss_mlp": 0.01086166, + "balance_loss_clip": 1.03571641, + "balance_loss_mlp": 1.00524664, + "epoch": 0.3988456682498647, + "flos": 23514020190720.0, + "grad_norm": 2.014660581262432, + "language_loss": 0.70819306, + "learning_rate": 2.734858302154894e-06, + "loss": 0.73054457, + "num_input_tokens_seen": 71481075, + "step": 3317, + "time_per_iteration": 2.6614439487457275 + }, + { + "auxiliary_loss_clip": 0.01125309, + "auxiliary_loss_mlp": 0.01084752, + "balance_loss_clip": 1.03181195, + "balance_loss_mlp": 1.0039283, + "epoch": 0.39896591114050384, + "flos": 19208761908480.0, + "grad_norm": 2.630173554634817, + "language_loss": 0.76587254, + "learning_rate": 2.734133762674457e-06, + "loss": 0.78797317, + "num_input_tokens_seen": 71500665, + "step": 3318, + "time_per_iteration": 2.740861177444458 + }, + { + "auxiliary_loss_clip": 0.01124439, + "auxiliary_loss_mlp": 0.01085417, + "balance_loss_clip": 1.02970135, + "balance_loss_mlp": 1.00454569, + "epoch": 0.3990861540311429, + "flos": 28401031146240.0, + "grad_norm": 2.5434811765299825, + "language_loss": 0.70892727, + "learning_rate": 2.7334091118265124e-06, + "loss": 0.73102582, + "num_input_tokens_seen": 71522560, + "step": 3319, + "time_per_iteration": 2.791710138320923 + }, + { + "auxiliary_loss_clip": 0.01130654, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_clip": 1.04084015, + "balance_loss_mlp": 0.99992287, + "epoch": 0.399206396921782, + "flos": 61758563086080.0, + "grad_norm": 0.6775830614246098, + "language_loss": 0.5782479, + "learning_rate": 2.732684349720989e-06, + "loss": 0.60034704, + "num_input_tokens_seen": 71590520, + "step": 3320, + "time_per_iteration": 3.2893741130828857 + }, + { + "auxiliary_loss_clip": 0.01100231, + "auxiliary_loss_mlp": 0.01086477, + "balance_loss_clip": 1.02893734, + "balance_loss_mlp": 1.00555766, + "epoch": 0.3993266398124211, + "flos": 28074567409920.0, + "grad_norm": 1.582314046589987, + "language_loss": 0.75158644, + "learning_rate": 2.7319594764678318e-06, + "loss": 0.77345347, + "num_input_tokens_seen": 71612620, + "step": 3321, + "time_per_iteration": 3.6888551712036133 + }, + { + "auxiliary_loss_clip": 0.01090456, + "auxiliary_loss_mlp": 0.01086716, + "balance_loss_clip": 1.02915251, + "balance_loss_mlp": 1.00574899, + "epoch": 0.39944688270306017, + "flos": 23225083188480.0, + "grad_norm": 2.270064499361544, + "language_loss": 0.8321352, + "learning_rate": 2.7312344921770044e-06, + "loss": 0.85390693, + "num_input_tokens_seen": 71634320, + "step": 3322, + "time_per_iteration": 2.8160908222198486 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01085927, + "balance_loss_clip": 1.03394914, + "balance_loss_mlp": 1.00500751, + "epoch": 0.3995671255936993, + "flos": 19390433921280.0, + "grad_norm": 1.8208137111043519, + "language_loss": 0.78375685, + "learning_rate": 2.7305093969584857e-06, + "loss": 0.80592358, + "num_input_tokens_seen": 71653145, + "step": 3323, + "time_per_iteration": 2.7394344806671143 + }, + { + "auxiliary_loss_clip": 0.01139661, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_clip": 1.0342871, + "balance_loss_mlp": 1.00514829, + "epoch": 0.3996873684843384, + "flos": 23842638743040.0, + "grad_norm": 2.5515915086408065, + "language_loss": 0.79737234, + "learning_rate": 2.729784190922272e-06, + "loss": 0.81962818, + "num_input_tokens_seen": 71674580, + "step": 3324, + "time_per_iteration": 3.720991373062134 + }, + { + "auxiliary_loss_clip": 0.01103769, + "auxiliary_loss_mlp": 0.01079506, + "balance_loss_clip": 1.0384779, + "balance_loss_mlp": 1.00016046, + "epoch": 0.39980761137497745, + "flos": 66576877280640.0, + "grad_norm": 1.1329746976084796, + "language_loss": 0.57123297, + "learning_rate": 2.729058874178378e-06, + "loss": 0.5930658, + "num_input_tokens_seen": 71745260, + "step": 3325, + "time_per_iteration": 3.3145618438720703 + }, + { + "auxiliary_loss_clip": 0.01129038, + "auxiliary_loss_mlp": 0.01086648, + "balance_loss_clip": 1.03315997, + "balance_loss_mlp": 1.00563359, + "epoch": 0.39992785426561656, + "flos": 28549162834560.0, + "grad_norm": 1.840965815572912, + "language_loss": 0.69245076, + "learning_rate": 2.7283334468368315e-06, + "loss": 0.7146076, + "num_input_tokens_seen": 71766540, + "step": 3326, + "time_per_iteration": 2.8011322021484375 + }, + { + "auxiliary_loss_clip": 0.01068543, + "auxiliary_loss_mlp": 0.01085048, + "balance_loss_clip": 1.0289185, + "balance_loss_mlp": 1.00408089, + "epoch": 0.4000480971562556, + "flos": 15049408671360.0, + "grad_norm": 1.8447765923270658, + "language_loss": 0.73029053, + "learning_rate": 2.72760790900768e-06, + "loss": 0.75182641, + "num_input_tokens_seen": 71783125, + "step": 3327, + "time_per_iteration": 4.09112024307251 + }, + { + "auxiliary_loss_clip": 0.01147236, + "auxiliary_loss_mlp": 0.01085886, + "balance_loss_clip": 1.03380728, + "balance_loss_mlp": 1.0049665, + "epoch": 0.4001683400468947, + "flos": 23915609222400.0, + "grad_norm": 1.6317315060400546, + "language_loss": 0.78722274, + "learning_rate": 2.7268822608009875e-06, + "loss": 0.80955392, + "num_input_tokens_seen": 71802500, + "step": 3328, + "time_per_iteration": 2.939619302749634 + }, + { + "auxiliary_loss_clip": 0.01118386, + "auxiliary_loss_mlp": 0.01086504, + "balance_loss_clip": 1.03149748, + "balance_loss_mlp": 1.00548887, + "epoch": 0.40028858293753383, + "flos": 24352677912960.0, + "grad_norm": 1.7588040907629554, + "language_loss": 0.78447104, + "learning_rate": 2.726156502326834e-06, + "loss": 0.80651987, + "num_input_tokens_seen": 71823800, + "step": 3329, + "time_per_iteration": 268.9367971420288 + }, + { + "auxiliary_loss_clip": 0.01073057, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_clip": 1.03150272, + "balance_loss_mlp": 1.00112808, + "epoch": 0.4004088258281729, + "flos": 66787025800320.0, + "grad_norm": 0.6989007543716265, + "language_loss": 0.60236514, + "learning_rate": 2.725430633695316e-06, + "loss": 0.62390041, + "num_input_tokens_seen": 71886880, + "step": 3330, + "time_per_iteration": 3.864673614501953 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.01079705, + "balance_loss_clip": 1.04024935, + "balance_loss_mlp": 1.00035954, + "epoch": 0.400529068718812, + "flos": 58598386473600.0, + "grad_norm": 0.8851140995676199, + "language_loss": 0.57934076, + "learning_rate": 2.7247046550165485e-06, + "loss": 0.60151958, + "num_input_tokens_seen": 71939005, + "step": 3331, + "time_per_iteration": 3.3472020626068115 + }, + { + "auxiliary_loss_clip": 0.01147479, + "auxiliary_loss_mlp": 0.01086814, + "balance_loss_clip": 1.03450787, + "balance_loss_mlp": 1.00579929, + "epoch": 0.4006493116094511, + "flos": 25377460934400.0, + "grad_norm": 1.3618382728621796, + "language_loss": 0.76049352, + "learning_rate": 2.7239785664006606e-06, + "loss": 0.78283644, + "num_input_tokens_seen": 71962545, + "step": 3332, + "time_per_iteration": 2.8164589405059814 + }, + { + "auxiliary_loss_clip": 0.01129678, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_clip": 1.03981113, + "balance_loss_mlp": 1.00031149, + "epoch": 0.40076955450009016, + "flos": 60280729822080.0, + "grad_norm": 0.7666628001837756, + "language_loss": 0.61787856, + "learning_rate": 2.7232523679578002e-06, + "loss": 0.63997185, + "num_input_tokens_seen": 72025625, + "step": 3333, + "time_per_iteration": 3.3689355850219727 + }, + { + "auxiliary_loss_clip": 0.01137593, + "auxiliary_loss_mlp": 0.01086744, + "balance_loss_clip": 1.03353167, + "balance_loss_mlp": 1.00591993, + "epoch": 0.4008897973907293, + "flos": 16617268396800.0, + "grad_norm": 2.1703702156314906, + "language_loss": 0.79371744, + "learning_rate": 2.7225260597981295e-06, + "loss": 0.81596082, + "num_input_tokens_seen": 72043330, + "step": 3334, + "time_per_iteration": 2.757591485977173 + }, + { + "auxiliary_loss_clip": 0.01109507, + "auxiliary_loss_mlp": 0.00873459, + "balance_loss_clip": 1.02618217, + "balance_loss_mlp": 1.00029373, + "epoch": 0.4010100402813684, + "flos": 15377344865280.0, + "grad_norm": 2.8374315456543093, + "language_loss": 0.78771293, + "learning_rate": 2.721799642031831e-06, + "loss": 0.80754262, + "num_input_tokens_seen": 72059500, + "step": 3335, + "time_per_iteration": 2.9536991119384766 + }, + { + "auxiliary_loss_clip": 0.01131693, + "auxiliary_loss_mlp": 0.01085784, + "balance_loss_clip": 1.03413129, + "balance_loss_mlp": 1.00486493, + "epoch": 0.40113028317200744, + "flos": 13298835438720.0, + "grad_norm": 2.2913565378947474, + "language_loss": 0.77379274, + "learning_rate": 2.721073114769101e-06, + "loss": 0.79596758, + "num_input_tokens_seen": 72077175, + "step": 3336, + "time_per_iteration": 2.8225433826446533 + }, + { + "auxiliary_loss_clip": 0.01114395, + "auxiliary_loss_mlp": 0.01086985, + "balance_loss_clip": 1.0296886, + "balance_loss_mlp": 1.00611353, + "epoch": 0.40125052606264655, + "flos": 20668027841280.0, + "grad_norm": 1.7544350522717063, + "language_loss": 0.75124013, + "learning_rate": 2.7203464781201523e-06, + "loss": 0.77325392, + "num_input_tokens_seen": 72096490, + "step": 3337, + "time_per_iteration": 2.906627893447876 + }, + { + "auxiliary_loss_clip": 0.01148255, + "auxiliary_loss_mlp": 0.01086332, + "balance_loss_clip": 1.03454542, + "balance_loss_mlp": 1.00550866, + "epoch": 0.40137076895328566, + "flos": 24607679541120.0, + "grad_norm": 1.9856559377936631, + "language_loss": 0.78250277, + "learning_rate": 2.719619732195215e-06, + "loss": 0.80484861, + "num_input_tokens_seen": 72118130, + "step": 3338, + "time_per_iteration": 2.769456624984741 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01087259, + "balance_loss_clip": 1.02970684, + "balance_loss_mlp": 1.00633931, + "epoch": 0.4014910118439247, + "flos": 24206593299840.0, + "grad_norm": 1.34580121391772, + "language_loss": 0.72568023, + "learning_rate": 2.7188928771045377e-06, + "loss": 0.7477113, + "num_input_tokens_seen": 72139450, + "step": 3339, + "time_per_iteration": 2.8924875259399414 + }, + { + "auxiliary_loss_clip": 0.01117565, + "auxiliary_loss_mlp": 0.01086215, + "balance_loss_clip": 1.03041911, + "balance_loss_mlp": 1.00520039, + "epoch": 0.4016112547345638, + "flos": 26725080418560.0, + "grad_norm": 2.3683421085958973, + "language_loss": 0.79881573, + "learning_rate": 2.7181659129583815e-06, + "loss": 0.82085359, + "num_input_tokens_seen": 72159040, + "step": 3340, + "time_per_iteration": 2.964232921600342 + }, + { + "auxiliary_loss_clip": 0.01129385, + "auxiliary_loss_mlp": 0.01085557, + "balance_loss_clip": 1.03215623, + "balance_loss_mlp": 1.0047332, + "epoch": 0.4017314976252029, + "flos": 21288025520640.0, + "grad_norm": 1.680511200666489, + "language_loss": 0.7574532, + "learning_rate": 2.7174388398670276e-06, + "loss": 0.77960265, + "num_input_tokens_seen": 72178220, + "step": 3341, + "time_per_iteration": 2.808171272277832 + }, + { + "auxiliary_loss_clip": 0.01145958, + "auxiliary_loss_mlp": 0.01085892, + "balance_loss_clip": 1.03249061, + "balance_loss_mlp": 1.00492477, + "epoch": 0.401851740515842, + "flos": 25484690010240.0, + "grad_norm": 1.7264729702044694, + "language_loss": 0.91838127, + "learning_rate": 2.716711657940773e-06, + "loss": 0.94069976, + "num_input_tokens_seen": 72199230, + "step": 3342, + "time_per_iteration": 3.6805787086486816 + }, + { + "auxiliary_loss_clip": 0.01094448, + "auxiliary_loss_mlp": 0.0107969, + "balance_loss_clip": 1.0377326, + "balance_loss_mlp": 1.00034451, + "epoch": 0.4019719834064811, + "flos": 55395334978560.0, + "grad_norm": 0.833426112525677, + "language_loss": 0.56503171, + "learning_rate": 2.7159843672899284e-06, + "loss": 0.5867731, + "num_input_tokens_seen": 72263430, + "step": 3343, + "time_per_iteration": 3.4975733757019043 + }, + { + "auxiliary_loss_clip": 0.0113715, + "auxiliary_loss_mlp": 0.01087153, + "balance_loss_clip": 1.03321719, + "balance_loss_mlp": 1.00613809, + "epoch": 0.40209222629712016, + "flos": 18180100218240.0, + "grad_norm": 1.7299968558776269, + "language_loss": 0.80996525, + "learning_rate": 2.715256968024825e-06, + "loss": 0.83220828, + "num_input_tokens_seen": 72280505, + "step": 3344, + "time_per_iteration": 2.7869064807891846 + }, + { + "auxiliary_loss_clip": 0.01113055, + "auxiliary_loss_mlp": 0.01085753, + "balance_loss_clip": 1.03186107, + "balance_loss_mlp": 1.0049293, + "epoch": 0.40221246918775927, + "flos": 25961009287680.0, + "grad_norm": 1.472020917339677, + "language_loss": 0.82446158, + "learning_rate": 2.7145294602558083e-06, + "loss": 0.84644973, + "num_input_tokens_seen": 72301215, + "step": 3345, + "time_per_iteration": 2.856299638748169 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01086041, + "balance_loss_clip": 1.03250086, + "balance_loss_mlp": 1.00512147, + "epoch": 0.4023327120783984, + "flos": 33838912056960.0, + "grad_norm": 2.1389928136104572, + "language_loss": 0.70831794, + "learning_rate": 2.713801844093241e-06, + "loss": 0.73054647, + "num_input_tokens_seen": 72322365, + "step": 3346, + "time_per_iteration": 4.3688647747039795 + }, + { + "auxiliary_loss_clip": 0.01136394, + "auxiliary_loss_mlp": 0.01085391, + "balance_loss_clip": 1.03234065, + "balance_loss_mlp": 1.00461459, + "epoch": 0.40245295496903744, + "flos": 26900252069760.0, + "grad_norm": 1.8718834590115814, + "language_loss": 0.88362592, + "learning_rate": 2.7130741196475014e-06, + "loss": 0.90584373, + "num_input_tokens_seen": 72340495, + "step": 3347, + "time_per_iteration": 2.7943432331085205 + }, + { + "auxiliary_loss_clip": 0.01121916, + "auxiliary_loss_mlp": 0.01085202, + "balance_loss_clip": 1.02824187, + "balance_loss_mlp": 1.00423467, + "epoch": 0.40257319785967655, + "flos": 36902738436480.0, + "grad_norm": 1.7919525828683631, + "language_loss": 0.79129124, + "learning_rate": 2.7123462870289848e-06, + "loss": 0.81336242, + "num_input_tokens_seen": 72360545, + "step": 3348, + "time_per_iteration": 3.8593332767486572 + }, + { + "auxiliary_loss_clip": 0.01127821, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_clip": 1.03199649, + "balance_loss_mlp": 1.00517952, + "epoch": 0.40269344075031566, + "flos": 24353180703360.0, + "grad_norm": 1.5186761464686336, + "language_loss": 0.81199539, + "learning_rate": 2.711618346348102e-06, + "loss": 0.8341341, + "num_input_tokens_seen": 72381070, + "step": 3349, + "time_per_iteration": 2.8098368644714355 + }, + { + "auxiliary_loss_clip": 0.0112869, + "auxiliary_loss_mlp": 0.01085365, + "balance_loss_clip": 1.032763, + "balance_loss_mlp": 1.00463605, + "epoch": 0.4028136836409547, + "flos": 14389657614720.0, + "grad_norm": 2.141596151133365, + "language_loss": 0.63877189, + "learning_rate": 2.7108902977152825e-06, + "loss": 0.66091239, + "num_input_tokens_seen": 72398970, + "step": 3350, + "time_per_iteration": 2.814767360687256 + }, + { + "auxiliary_loss_clip": 0.01139316, + "auxiliary_loss_mlp": 0.01086902, + "balance_loss_clip": 1.03426552, + "balance_loss_mlp": 1.00588775, + "epoch": 0.4029339265315938, + "flos": 26136037284480.0, + "grad_norm": 1.9282026011333457, + "language_loss": 0.74726164, + "learning_rate": 2.7101621412409704e-06, + "loss": 0.7695238, + "num_input_tokens_seen": 72418455, + "step": 3351, + "time_per_iteration": 2.8215179443359375 + }, + { + "auxiliary_loss_clip": 0.01146998, + "auxiliary_loss_mlp": 0.01086459, + "balance_loss_clip": 1.03325415, + "balance_loss_mlp": 1.00549221, + "epoch": 0.40305416942223293, + "flos": 23256325042560.0, + "grad_norm": 2.3161477599845175, + "language_loss": 0.86160469, + "learning_rate": 2.7094338770356256e-06, + "loss": 0.88393927, + "num_input_tokens_seen": 72437540, + "step": 3352, + "time_per_iteration": 3.7698068618774414 + }, + { + "auxiliary_loss_clip": 0.011227, + "auxiliary_loss_mlp": 0.0108451, + "balance_loss_clip": 1.02962375, + "balance_loss_mlp": 1.00368643, + "epoch": 0.403174412312872, + "flos": 27089645506560.0, + "grad_norm": 3.8822161351625986, + "language_loss": 0.63915372, + "learning_rate": 2.708705505209726e-06, + "loss": 0.6612258, + "num_input_tokens_seen": 72458315, + "step": 3353, + "time_per_iteration": 2.851986885070801 + }, + { + "auxiliary_loss_clip": 0.01111018, + "auxiliary_loss_mlp": 0.01087421, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.00659704, + "epoch": 0.4032946552035111, + "flos": 21756336065280.0, + "grad_norm": 1.9294563026562062, + "language_loss": 0.91814315, + "learning_rate": 2.7079770258737646e-06, + "loss": 0.94012755, + "num_input_tokens_seen": 72476225, + "step": 3354, + "time_per_iteration": 2.8851022720336914 + }, + { + "auxiliary_loss_clip": 0.01119689, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_clip": 1.03211713, + "balance_loss_mlp": 1.00550103, + "epoch": 0.4034148980941502, + "flos": 17343956448000.0, + "grad_norm": 2.120414951072218, + "language_loss": 0.75414395, + "learning_rate": 2.707248439138251e-06, + "loss": 0.77620548, + "num_input_tokens_seen": 72492460, + "step": 3355, + "time_per_iteration": 2.823011636734009 + }, + { + "auxiliary_loss_clip": 0.01125652, + "auxiliary_loss_mlp": 0.01085499, + "balance_loss_clip": 1.03199553, + "balance_loss_mlp": 1.00472307, + "epoch": 0.40353514098478926, + "flos": 22017838055040.0, + "grad_norm": 1.6700607066901285, + "language_loss": 0.65285993, + "learning_rate": 2.7065197451137114e-06, + "loss": 0.67497146, + "num_input_tokens_seen": 72513840, + "step": 3356, + "time_per_iteration": 2.8634865283966064 + }, + { + "auxiliary_loss_clip": 0.01125016, + "auxiliary_loss_mlp": 0.01085702, + "balance_loss_clip": 1.0306983, + "balance_loss_mlp": 1.00492585, + "epoch": 0.4036553838754284, + "flos": 14246446089600.0, + "grad_norm": 1.9279061281526946, + "language_loss": 0.67398286, + "learning_rate": 2.7057909439106894e-06, + "loss": 0.69609004, + "num_input_tokens_seen": 72531695, + "step": 3357, + "time_per_iteration": 2.7273764610290527 + }, + { + "auxiliary_loss_clip": 0.01138085, + "auxiliary_loss_mlp": 0.0087328, + "balance_loss_clip": 1.03299451, + "balance_loss_mlp": 1.00036979, + "epoch": 0.40377562676606743, + "flos": 24790644443520.0, + "grad_norm": 1.6816720792661075, + "language_loss": 0.77963936, + "learning_rate": 2.7050620356397417e-06, + "loss": 0.79975301, + "num_input_tokens_seen": 72550645, + "step": 3358, + "time_per_iteration": 2.7652459144592285 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01085639, + "balance_loss_clip": 1.03496075, + "balance_loss_mlp": 1.00505328, + "epoch": 0.40389586965670654, + "flos": 24061226958720.0, + "grad_norm": 1.8144362665244955, + "language_loss": 0.7221303, + "learning_rate": 2.7043330204114437e-06, + "loss": 0.74446487, + "num_input_tokens_seen": 72569355, + "step": 3359, + "time_per_iteration": 2.7369816303253174 + }, + { + "auxiliary_loss_clip": 0.01145319, + "auxiliary_loss_mlp": 0.01085231, + "balance_loss_clip": 1.03263974, + "balance_loss_mlp": 1.00450253, + "epoch": 0.40401611254734565, + "flos": 16399613934720.0, + "grad_norm": 2.020456982558002, + "language_loss": 0.85419202, + "learning_rate": 2.7036038983363862e-06, + "loss": 0.87649751, + "num_input_tokens_seen": 72585960, + "step": 3360, + "time_per_iteration": 2.736556053161621 + }, + { + "auxiliary_loss_clip": 0.01135562, + "auxiliary_loss_mlp": 0.01084589, + "balance_loss_clip": 1.03234053, + "balance_loss_mlp": 1.0039084, + "epoch": 0.4041363554379847, + "flos": 23988220565760.0, + "grad_norm": 1.7172396618642012, + "language_loss": 0.84408844, + "learning_rate": 2.702874669525177e-06, + "loss": 0.86628997, + "num_input_tokens_seen": 72604440, + "step": 3361, + "time_per_iteration": 2.8069229125976562 + }, + { + "auxiliary_loss_clip": 0.01112978, + "auxiliary_loss_mlp": 0.01086023, + "balance_loss_clip": 1.02804136, + "balance_loss_mlp": 1.00534213, + "epoch": 0.4042565983286238, + "flos": 28401964899840.0, + "grad_norm": 2.0088688978274565, + "language_loss": 0.69720358, + "learning_rate": 2.7021453340884394e-06, + "loss": 0.71919358, + "num_input_tokens_seen": 72622165, + "step": 3362, + "time_per_iteration": 2.8868954181671143 + }, + { + "auxiliary_loss_clip": 0.01126925, + "auxiliary_loss_mlp": 0.00873224, + "balance_loss_clip": 1.03211164, + "balance_loss_mlp": 1.00036049, + "epoch": 0.40437684121926293, + "flos": 17710963660800.0, + "grad_norm": 1.9370523726402813, + "language_loss": 0.73029703, + "learning_rate": 2.7014158921368125e-06, + "loss": 0.7502985, + "num_input_tokens_seen": 72640490, + "step": 3363, + "time_per_iteration": 2.8163018226623535 + }, + { + "auxiliary_loss_clip": 0.01146865, + "auxiliary_loss_mlp": 0.01086506, + "balance_loss_clip": 1.0337956, + "balance_loss_mlp": 1.00572944, + "epoch": 0.404497084109902, + "flos": 24018959629440.0, + "grad_norm": 1.713604223217385, + "language_loss": 0.85079879, + "learning_rate": 2.700686343780953e-06, + "loss": 0.87313247, + "num_input_tokens_seen": 72660360, + "step": 3364, + "time_per_iteration": 2.7105774879455566 + }, + { + "auxiliary_loss_clip": 0.01125071, + "auxiliary_loss_mlp": 0.01085118, + "balance_loss_clip": 1.02938497, + "balance_loss_mlp": 1.00434232, + "epoch": 0.4046173270005411, + "flos": 22929861306240.0, + "grad_norm": 1.713472728741095, + "language_loss": 0.88391185, + "learning_rate": 2.699956689131532e-06, + "loss": 0.90601373, + "num_input_tokens_seen": 72680345, + "step": 3365, + "time_per_iteration": 2.896848440170288 + }, + { + "auxiliary_loss_clip": 0.01111314, + "auxiliary_loss_mlp": 0.01084698, + "balance_loss_clip": 1.03137827, + "balance_loss_mlp": 1.00392175, + "epoch": 0.4047375698911802, + "flos": 20668135582080.0, + "grad_norm": 2.515820304416286, + "language_loss": 0.84850764, + "learning_rate": 2.699226928299238e-06, + "loss": 0.87046778, + "num_input_tokens_seen": 72698365, + "step": 3366, + "time_per_iteration": 2.767643928527832 + }, + { + "auxiliary_loss_clip": 0.01136888, + "auxiliary_loss_mlp": 0.01087073, + "balance_loss_clip": 1.03253818, + "balance_loss_mlp": 1.00634432, + "epoch": 0.40485781278181926, + "flos": 28912865996160.0, + "grad_norm": 2.7599138347852814, + "language_loss": 0.78285879, + "learning_rate": 2.698497061394774e-06, + "loss": 0.80509841, + "num_input_tokens_seen": 72716850, + "step": 3367, + "time_per_iteration": 3.7507224082946777 + }, + { + "auxiliary_loss_clip": 0.01116929, + "auxiliary_loss_mlp": 0.00873193, + "balance_loss_clip": 1.03013062, + "balance_loss_mlp": 1.00029719, + "epoch": 0.40497805567245837, + "flos": 23148377694720.0, + "grad_norm": 1.5376959888656148, + "language_loss": 0.80807167, + "learning_rate": 2.6977670885288627e-06, + "loss": 0.82797289, + "num_input_tokens_seen": 72738250, + "step": 3368, + "time_per_iteration": 2.924861192703247 + }, + { + "auxiliary_loss_clip": 0.01126011, + "auxiliary_loss_mlp": 0.01084965, + "balance_loss_clip": 1.03103161, + "balance_loss_mlp": 1.00409389, + "epoch": 0.4050982985630975, + "flos": 16289404030080.0, + "grad_norm": 1.8696228327238515, + "language_loss": 0.75070584, + "learning_rate": 2.6970370098122378e-06, + "loss": 0.77281564, + "num_input_tokens_seen": 72755235, + "step": 3369, + "time_per_iteration": 2.771725654602051 + }, + { + "auxiliary_loss_clip": 0.01145301, + "auxiliary_loss_mlp": 0.01085972, + "balance_loss_clip": 1.03261256, + "balance_loss_mlp": 1.00514781, + "epoch": 0.40521854145373654, + "flos": 34459484353920.0, + "grad_norm": 1.5336915132906603, + "language_loss": 0.86369574, + "learning_rate": 2.6963068253556535e-06, + "loss": 0.88600844, + "num_input_tokens_seen": 72776620, + "step": 3370, + "time_per_iteration": 2.915158987045288 + }, + { + "auxiliary_loss_clip": 0.0112224, + "auxiliary_loss_mlp": 0.01085632, + "balance_loss_clip": 1.03208542, + "balance_loss_mlp": 1.00471234, + "epoch": 0.40533878434437565, + "flos": 25331099454720.0, + "grad_norm": 2.0294361234553557, + "language_loss": 0.85561562, + "learning_rate": 2.6955765352698763e-06, + "loss": 0.87769431, + "num_input_tokens_seen": 72796765, + "step": 3371, + "time_per_iteration": 4.169870376586914 + }, + { + "auxiliary_loss_clip": 0.01146434, + "auxiliary_loss_mlp": 0.01086899, + "balance_loss_clip": 1.03307617, + "balance_loss_mlp": 1.00597966, + "epoch": 0.40545902723501476, + "flos": 15012061505280.0, + "grad_norm": 2.207952679417558, + "language_loss": 0.73335481, + "learning_rate": 2.6948461396656923e-06, + "loss": 0.75568819, + "num_input_tokens_seen": 72814175, + "step": 3372, + "time_per_iteration": 2.6602694988250732 + }, + { + "auxiliary_loss_clip": 0.01123331, + "auxiliary_loss_mlp": 0.01086466, + "balance_loss_clip": 1.03339982, + "balance_loss_mlp": 1.0057379, + "epoch": 0.4055792701256538, + "flos": 25521103422720.0, + "grad_norm": 2.1514438233544904, + "language_loss": 0.74373043, + "learning_rate": 2.6941156386539013e-06, + "loss": 0.76582837, + "num_input_tokens_seen": 72834125, + "step": 3373, + "time_per_iteration": 2.8520750999450684 + }, + { + "auxiliary_loss_clip": 0.01123964, + "auxiliary_loss_mlp": 0.01085828, + "balance_loss_clip": 1.03027272, + "balance_loss_mlp": 1.00505233, + "epoch": 0.4056995130162929, + "flos": 19574583972480.0, + "grad_norm": 2.311993131280342, + "language_loss": 0.80805969, + "learning_rate": 2.6933850323453203e-06, + "loss": 0.83015764, + "num_input_tokens_seen": 72852570, + "step": 3374, + "time_per_iteration": 3.7994132041931152 + }, + { + "auxiliary_loss_clip": 0.01146904, + "auxiliary_loss_mlp": 0.01085355, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.00467384, + "epoch": 0.405819755906932, + "flos": 15413794191360.0, + "grad_norm": 1.7415464898699098, + "language_loss": 0.7467978, + "learning_rate": 2.6926543208507806e-06, + "loss": 0.7691204, + "num_input_tokens_seen": 72871250, + "step": 3375, + "time_per_iteration": 2.768873691558838 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_clip": 1.03153872, + "balance_loss_mlp": 1.00510371, + "epoch": 0.4059399987975711, + "flos": 21433930565760.0, + "grad_norm": 2.1353549221861665, + "language_loss": 0.79886258, + "learning_rate": 2.6919235042811316e-06, + "loss": 0.82107103, + "num_input_tokens_seen": 72890035, + "step": 3376, + "time_per_iteration": 2.8141021728515625 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01086746, + "balance_loss_clip": 1.02869678, + "balance_loss_mlp": 1.00582671, + "epoch": 0.4060602416882102, + "flos": 25556942217600.0, + "grad_norm": 2.8274358740748715, + "language_loss": 0.76601303, + "learning_rate": 2.691192582747237e-06, + "loss": 0.78798282, + "num_input_tokens_seen": 72909665, + "step": 3377, + "time_per_iteration": 3.7783451080322266 + }, + { + "auxiliary_loss_clip": 0.01145523, + "auxiliary_loss_mlp": 0.01086465, + "balance_loss_clip": 1.0328635, + "balance_loss_mlp": 1.00568855, + "epoch": 0.40618048457884925, + "flos": 23766759262080.0, + "grad_norm": 1.697204862762869, + "language_loss": 0.73893237, + "learning_rate": 2.6904615563599765e-06, + "loss": 0.76125222, + "num_input_tokens_seen": 72929465, + "step": 3378, + "time_per_iteration": 2.7585926055908203 + }, + { + "auxiliary_loss_clip": 0.01109608, + "auxiliary_loss_mlp": 0.0108577, + "balance_loss_clip": 1.02918816, + "balance_loss_mlp": 1.00508893, + "epoch": 0.40630072746948837, + "flos": 17639681120640.0, + "grad_norm": 1.6800127252410446, + "language_loss": 0.83510947, + "learning_rate": 2.6897304252302477e-06, + "loss": 0.85706323, + "num_input_tokens_seen": 72946785, + "step": 3379, + "time_per_iteration": 2.815610647201538 + }, + { + "auxiliary_loss_clip": 0.01108385, + "auxiliary_loss_mlp": 0.01080119, + "balance_loss_clip": 1.0357995, + "balance_loss_mlp": 1.00077367, + "epoch": 0.4064209703601275, + "flos": 60836053063680.0, + "grad_norm": 0.7901384657763046, + "language_loss": 0.54814559, + "learning_rate": 2.688999189468962e-06, + "loss": 0.57003063, + "num_input_tokens_seen": 73003215, + "step": 3380, + "time_per_iteration": 3.210453748703003 + }, + { + "auxiliary_loss_clip": 0.01131122, + "auxiliary_loss_mlp": 0.01087543, + "balance_loss_clip": 1.02905965, + "balance_loss_mlp": 1.00667143, + "epoch": 0.40654121325076653, + "flos": 24024346669440.0, + "grad_norm": 2.243604362098844, + "language_loss": 0.75745428, + "learning_rate": 2.6882678491870464e-06, + "loss": 0.77964091, + "num_input_tokens_seen": 73023650, + "step": 3381, + "time_per_iteration": 2.8048033714294434 + }, + { + "auxiliary_loss_clip": 0.01137262, + "auxiliary_loss_mlp": 0.01086793, + "balance_loss_clip": 1.03253603, + "balance_loss_mlp": 1.00611234, + "epoch": 0.40666145614140564, + "flos": 27344252085120.0, + "grad_norm": 1.7503389649714816, + "language_loss": 0.71494496, + "learning_rate": 2.6875364044954453e-06, + "loss": 0.73718554, + "num_input_tokens_seen": 73043880, + "step": 3382, + "time_per_iteration": 2.757061719894409 + }, + { + "auxiliary_loss_clip": 0.01128669, + "auxiliary_loss_mlp": 0.01085299, + "balance_loss_clip": 1.03201544, + "balance_loss_mlp": 1.00461829, + "epoch": 0.40678169903204475, + "flos": 26176724415360.0, + "grad_norm": 1.605892116026781, + "language_loss": 0.82412225, + "learning_rate": 2.6868048555051185e-06, + "loss": 0.84626198, + "num_input_tokens_seen": 73065410, + "step": 3383, + "time_per_iteration": 2.8341307640075684 + }, + { + "auxiliary_loss_clip": 0.01128764, + "auxiliary_loss_mlp": 0.01086768, + "balance_loss_clip": 1.03151262, + "balance_loss_mlp": 1.00599182, + "epoch": 0.4069019419226838, + "flos": 28622420622720.0, + "grad_norm": 2.673800072495227, + "language_loss": 0.85266697, + "learning_rate": 2.686073202327041e-06, + "loss": 0.8748222, + "num_input_tokens_seen": 73084410, + "step": 3384, + "time_per_iteration": 2.802414894104004 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01085915, + "balance_loss_clip": 1.03186202, + "balance_loss_mlp": 1.00532961, + "epoch": 0.4070221848133229, + "flos": 25229006023680.0, + "grad_norm": 2.103691698332427, + "language_loss": 0.73777604, + "learning_rate": 2.6853414450722043e-06, + "loss": 0.75992173, + "num_input_tokens_seen": 73104075, + "step": 3385, + "time_per_iteration": 2.7887039184570312 + }, + { + "auxiliary_loss_clip": 0.01133828, + "auxiliary_loss_mlp": 0.01085315, + "balance_loss_clip": 1.03015256, + "balance_loss_mlp": 1.00463414, + "epoch": 0.40714242770396203, + "flos": 18405224709120.0, + "grad_norm": 1.7222187990635482, + "language_loss": 0.85522783, + "learning_rate": 2.684609583851616e-06, + "loss": 0.87741923, + "num_input_tokens_seen": 73122250, + "step": 3386, + "time_per_iteration": 2.717883825302124 + }, + { + "auxiliary_loss_clip": 0.01104799, + "auxiliary_loss_mlp": 0.01085651, + "balance_loss_clip": 1.02818668, + "balance_loss_mlp": 1.00487423, + "epoch": 0.4072626705946011, + "flos": 30228920403840.0, + "grad_norm": 1.5695300147822429, + "language_loss": 0.80884206, + "learning_rate": 2.683877618776297e-06, + "loss": 0.83074653, + "num_input_tokens_seen": 73144505, + "step": 3387, + "time_per_iteration": 2.9194729328155518 + }, + { + "auxiliary_loss_clip": 0.01129608, + "auxiliary_loss_mlp": 0.01085287, + "balance_loss_clip": 1.03253126, + "balance_loss_mlp": 1.0044632, + "epoch": 0.4073829134852402, + "flos": 21834549930240.0, + "grad_norm": 2.0376681910427377, + "language_loss": 0.74159551, + "learning_rate": 2.6831455499572876e-06, + "loss": 0.76374447, + "num_input_tokens_seen": 73162440, + "step": 3388, + "time_per_iteration": 2.7845618724823 + }, + { + "auxiliary_loss_clip": 0.01146101, + "auxiliary_loss_mlp": 0.01085047, + "balance_loss_clip": 1.03318167, + "balance_loss_mlp": 1.00427079, + "epoch": 0.40750315637587925, + "flos": 25260211964160.0, + "grad_norm": 1.828828823300905, + "language_loss": 0.77667165, + "learning_rate": 2.682413377505641e-06, + "loss": 0.7989831, + "num_input_tokens_seen": 73181245, + "step": 3389, + "time_per_iteration": 2.688101053237915 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_clip": 1.03089869, + "balance_loss_mlp": 1.0070138, + "epoch": 0.40762339926651836, + "flos": 19712767593600.0, + "grad_norm": 2.7082729562329533, + "language_loss": 0.76747972, + "learning_rate": 2.6816811015324284e-06, + "loss": 0.78970486, + "num_input_tokens_seen": 73199295, + "step": 3390, + "time_per_iteration": 2.775897979736328 + }, + { + "auxiliary_loss_clip": 0.01135777, + "auxiliary_loss_mlp": 0.01079357, + "balance_loss_clip": 1.03811407, + "balance_loss_mlp": 1.00001109, + "epoch": 0.40774364215715747, + "flos": 71449307314560.0, + "grad_norm": 0.7267363473908411, + "language_loss": 0.56726366, + "learning_rate": 2.6809487221487343e-06, + "loss": 0.58941501, + "num_input_tokens_seen": 73258780, + "step": 3391, + "time_per_iteration": 3.157111406326294 + }, + { + "auxiliary_loss_clip": 0.01136243, + "auxiliary_loss_mlp": 0.0108537, + "balance_loss_clip": 1.03110933, + "balance_loss_mlp": 1.00464129, + "epoch": 0.4078638850477965, + "flos": 15084134144640.0, + "grad_norm": 2.245556730834614, + "language_loss": 0.81904638, + "learning_rate": 2.6802162394656605e-06, + "loss": 0.84126246, + "num_input_tokens_seen": 73275490, + "step": 3392, + "time_per_iteration": 3.623080253601074 + }, + { + "auxiliary_loss_clip": 0.01125071, + "auxiliary_loss_mlp": 0.01085477, + "balance_loss_clip": 1.02985764, + "balance_loss_mlp": 1.00479627, + "epoch": 0.40798412793843564, + "flos": 23842890138240.0, + "grad_norm": 1.9563069878182207, + "language_loss": 0.71692532, + "learning_rate": 2.679483653594324e-06, + "loss": 0.73903084, + "num_input_tokens_seen": 73297260, + "step": 3393, + "time_per_iteration": 2.862344741821289 + }, + { + "auxiliary_loss_clip": 0.01135793, + "auxiliary_loss_mlp": 0.01086118, + "balance_loss_clip": 1.03119016, + "balance_loss_mlp": 1.00548506, + "epoch": 0.40810437082907475, + "flos": 21065774117760.0, + "grad_norm": 2.3584985023279974, + "language_loss": 0.76577109, + "learning_rate": 2.678750964645857e-06, + "loss": 0.78799021, + "num_input_tokens_seen": 73316340, + "step": 3394, + "time_per_iteration": 2.721190929412842 + }, + { + "auxiliary_loss_clip": 0.01130879, + "auxiliary_loss_mlp": 0.01086652, + "balance_loss_clip": 1.02893066, + "balance_loss_mlp": 1.00573313, + "epoch": 0.4082246137197138, + "flos": 11321377948800.0, + "grad_norm": 2.3420019816217525, + "language_loss": 0.83715957, + "learning_rate": 2.6780181727314094e-06, + "loss": 0.85933495, + "num_input_tokens_seen": 73331245, + "step": 3395, + "time_per_iteration": 2.6770622730255127 + }, + { + "auxiliary_loss_clip": 0.01117357, + "auxiliary_loss_mlp": 0.00873177, + "balance_loss_clip": 1.03012943, + "balance_loss_mlp": 1.00034237, + "epoch": 0.4083448566103529, + "flos": 19062569554560.0, + "grad_norm": 1.6221309704732145, + "language_loss": 0.77491546, + "learning_rate": 2.6772852779621435e-06, + "loss": 0.79482085, + "num_input_tokens_seen": 73349105, + "step": 3396, + "time_per_iteration": 2.7982897758483887 + }, + { + "auxiliary_loss_clip": 0.0112948, + "auxiliary_loss_mlp": 0.00873118, + "balance_loss_clip": 1.02823985, + "balance_loss_mlp": 1.0003289, + "epoch": 0.408465099500992, + "flos": 23550254035200.0, + "grad_norm": 1.7832237724585782, + "language_loss": 0.8671006, + "learning_rate": 2.676552280449239e-06, + "loss": 0.88712656, + "num_input_tokens_seen": 73368990, + "step": 3397, + "time_per_iteration": 2.756286382675171 + }, + { + "auxiliary_loss_clip": 0.01137612, + "auxiliary_loss_mlp": 0.01085848, + "balance_loss_clip": 1.03290963, + "balance_loss_mlp": 1.00507188, + "epoch": 0.4085853423916311, + "flos": 12750012558720.0, + "grad_norm": 2.3029321786875268, + "language_loss": 0.75029933, + "learning_rate": 2.6758191803038917e-06, + "loss": 0.77253389, + "num_input_tokens_seen": 73387485, + "step": 3398, + "time_per_iteration": 3.7764534950256348 + }, + { + "auxiliary_loss_clip": 0.01088271, + "auxiliary_loss_mlp": 0.01086527, + "balance_loss_clip": 1.02607918, + "balance_loss_mlp": 1.00575137, + "epoch": 0.4087055852822702, + "flos": 24353072962560.0, + "grad_norm": 1.7696064973805614, + "language_loss": 0.83156025, + "learning_rate": 2.6750859776373125e-06, + "loss": 0.85330832, + "num_input_tokens_seen": 73406940, + "step": 3399, + "time_per_iteration": 4.031936407089233 + }, + { + "auxiliary_loss_clip": 0.01094623, + "auxiliary_loss_mlp": 0.01079519, + "balance_loss_clip": 1.03732979, + "balance_loss_mlp": 1.00017285, + "epoch": 0.4088258281729093, + "flos": 66387950720640.0, + "grad_norm": 0.7658794822400073, + "language_loss": 0.6038993, + "learning_rate": 2.674352672560727e-06, + "loss": 0.62564069, + "num_input_tokens_seen": 73468385, + "step": 3400, + "time_per_iteration": 3.679044723510742 + }, + { + "auxiliary_loss_clip": 0.01102296, + "auxiliary_loss_mlp": 0.010861, + "balance_loss_clip": 1.03047812, + "balance_loss_mlp": 1.00522852, + "epoch": 0.40894607106354836, + "flos": 20449260057600.0, + "grad_norm": 1.5626750564743008, + "language_loss": 0.76662487, + "learning_rate": 2.673619265185377e-06, + "loss": 0.78850883, + "num_input_tokens_seen": 73488225, + "step": 3401, + "time_per_iteration": 3.0240824222564697 + }, + { + "auxiliary_loss_clip": 0.01136087, + "auxiliary_loss_mlp": 0.01085927, + "balance_loss_clip": 1.03125346, + "balance_loss_mlp": 1.00519896, + "epoch": 0.40906631395418747, + "flos": 27053627143680.0, + "grad_norm": 1.5496203390265015, + "language_loss": 0.78312755, + "learning_rate": 2.672885755622521e-06, + "loss": 0.80534768, + "num_input_tokens_seen": 73510640, + "step": 3402, + "time_per_iteration": 2.7304248809814453 + }, + { + "auxiliary_loss_clip": 0.01102532, + "auxiliary_loss_mlp": 0.01085011, + "balance_loss_clip": 1.02648473, + "balance_loss_mlp": 1.00433028, + "epoch": 0.4091865568448266, + "flos": 25484151306240.0, + "grad_norm": 2.0522208697555118, + "language_loss": 0.70338809, + "learning_rate": 2.67215214398343e-06, + "loss": 0.72526348, + "num_input_tokens_seen": 73530655, + "step": 3403, + "time_per_iteration": 3.769556999206543 + }, + { + "auxiliary_loss_clip": 0.01107528, + "auxiliary_loss_mlp": 0.01085876, + "balance_loss_clip": 1.02866197, + "balance_loss_mlp": 1.0051477, + "epoch": 0.40930679973546563, + "flos": 28657864368000.0, + "grad_norm": 2.3417797295879836, + "language_loss": 0.78104675, + "learning_rate": 2.671418430379393e-06, + "loss": 0.80298078, + "num_input_tokens_seen": 73549340, + "step": 3404, + "time_per_iteration": 2.9690489768981934 + }, + { + "auxiliary_loss_clip": 0.01145346, + "auxiliary_loss_mlp": 0.01084759, + "balance_loss_clip": 1.03187108, + "balance_loss_mlp": 1.00403023, + "epoch": 0.40942704262610474, + "flos": 20886292834560.0, + "grad_norm": 1.8572254915911577, + "language_loss": 0.83507943, + "learning_rate": 2.670684614921715e-06, + "loss": 0.85738045, + "num_input_tokens_seen": 73568315, + "step": 3405, + "time_per_iteration": 2.6515448093414307 + }, + { + "auxiliary_loss_clip": 0.01128737, + "auxiliary_loss_mlp": 0.01087177, + "balance_loss_clip": 1.03181386, + "balance_loss_mlp": 1.00644863, + "epoch": 0.4095472855167438, + "flos": 21618080616960.0, + "grad_norm": 2.0823684574110213, + "language_loss": 0.69343036, + "learning_rate": 2.6699506977217128e-06, + "loss": 0.71558952, + "num_input_tokens_seen": 73588490, + "step": 3406, + "time_per_iteration": 2.8077924251556396 + }, + { + "auxiliary_loss_clip": 0.0113559, + "auxiliary_loss_mlp": 0.0108562, + "balance_loss_clip": 1.03187001, + "balance_loss_mlp": 1.00493944, + "epoch": 0.4096675284073829, + "flos": 27926112499200.0, + "grad_norm": 1.878122487402352, + "language_loss": 0.7035374, + "learning_rate": 2.6692166788907233e-06, + "loss": 0.72574955, + "num_input_tokens_seen": 73608685, + "step": 3407, + "time_per_iteration": 2.72282075881958 + }, + { + "auxiliary_loss_clip": 0.01125488, + "auxiliary_loss_mlp": 0.01086096, + "balance_loss_clip": 1.02989364, + "balance_loss_mlp": 1.00531971, + "epoch": 0.409787771298022, + "flos": 19206607092480.0, + "grad_norm": 2.8151311316192937, + "language_loss": 0.76829582, + "learning_rate": 2.6684825585400957e-06, + "loss": 0.79041171, + "num_input_tokens_seen": 73627630, + "step": 3408, + "time_per_iteration": 2.7494707107543945 + }, + { + "auxiliary_loss_clip": 0.01117144, + "auxiliary_loss_mlp": 0.01079322, + "balance_loss_clip": 1.03642988, + "balance_loss_mlp": 0.99997658, + "epoch": 0.4099080141886611, + "flos": 59269234832640.0, + "grad_norm": 0.8164324761104292, + "language_loss": 0.65078509, + "learning_rate": 2.6677483367811947e-06, + "loss": 0.67274976, + "num_input_tokens_seen": 73687670, + "step": 3409, + "time_per_iteration": 3.472046375274658 + }, + { + "auxiliary_loss_clip": 0.01135054, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_clip": 1.02989066, + "balance_loss_mlp": 1.00477457, + "epoch": 0.4100282570793002, + "flos": 21906443001600.0, + "grad_norm": 1.7465976057367885, + "language_loss": 0.75685209, + "learning_rate": 2.6670140137254028e-06, + "loss": 0.77905768, + "num_input_tokens_seen": 73707145, + "step": 3410, + "time_per_iteration": 2.6901206970214844 + }, + { + "auxiliary_loss_clip": 0.01106129, + "auxiliary_loss_mlp": 0.01086001, + "balance_loss_clip": 1.02842128, + "balance_loss_mlp": 1.00532031, + "epoch": 0.4101484999699393, + "flos": 18551596631040.0, + "grad_norm": 2.177723223373164, + "language_loss": 0.89304459, + "learning_rate": 2.666279589484115e-06, + "loss": 0.91496593, + "num_input_tokens_seen": 73725045, + "step": 3411, + "time_per_iteration": 2.8139243125915527 + }, + { + "auxiliary_loss_clip": 0.01111158, + "auxiliary_loss_mlp": 0.01086372, + "balance_loss_clip": 1.03102612, + "balance_loss_mlp": 1.00559533, + "epoch": 0.41026874286057835, + "flos": 19094529680640.0, + "grad_norm": 1.7182009047079516, + "language_loss": 0.81150264, + "learning_rate": 2.6655450641687435e-06, + "loss": 0.83347791, + "num_input_tokens_seen": 73742610, + "step": 3412, + "time_per_iteration": 2.8259191513061523 + }, + { + "auxiliary_loss_clip": 0.01146078, + "auxiliary_loss_mlp": 0.01086533, + "balance_loss_clip": 1.03323257, + "balance_loss_mlp": 1.00585175, + "epoch": 0.41038898575121746, + "flos": 31209568588800.0, + "grad_norm": 1.613773444973832, + "language_loss": 0.69416893, + "learning_rate": 2.664810437890715e-06, + "loss": 0.71649504, + "num_input_tokens_seen": 73764280, + "step": 3413, + "time_per_iteration": 2.769179344177246 + }, + { + "auxiliary_loss_clip": 0.01091971, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_clip": 1.02819145, + "balance_loss_mlp": 1.00429273, + "epoch": 0.41050922864185657, + "flos": 14355865895040.0, + "grad_norm": 2.8741567761982845, + "language_loss": 0.8003751, + "learning_rate": 2.6640757107614714e-06, + "loss": 0.82214361, + "num_input_tokens_seen": 73782375, + "step": 3414, + "time_per_iteration": 2.7999050617218018 + }, + { + "auxiliary_loss_clip": 0.01109155, + "auxiliary_loss_mlp": 0.01087126, + "balance_loss_clip": 1.0250473, + "balance_loss_mlp": 1.00639772, + "epoch": 0.4106294715324956, + "flos": 30956290813440.0, + "grad_norm": 1.990942961071586, + "language_loss": 0.68963277, + "learning_rate": 2.6633408828924697e-06, + "loss": 0.71159559, + "num_input_tokens_seen": 73801240, + "step": 3415, + "time_per_iteration": 2.9340856075286865 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.03188634, + "balance_loss_mlp": 1.00388813, + "epoch": 0.41074971442313474, + "flos": 24457321209600.0, + "grad_norm": 1.5411370741444308, + "language_loss": 0.70006198, + "learning_rate": 2.662605954395185e-06, + "loss": 0.72210944, + "num_input_tokens_seen": 73821200, + "step": 3416, + "time_per_iteration": 2.789252758026123 + }, + { + "auxiliary_loss_clip": 0.01136139, + "auxiliary_loss_mlp": 0.01086159, + "balance_loss_clip": 1.03107214, + "balance_loss_mlp": 1.00543082, + "epoch": 0.41086995731377385, + "flos": 21542991235200.0, + "grad_norm": 2.307926032105126, + "language_loss": 0.8387984, + "learning_rate": 2.6618709253811027e-06, + "loss": 0.8610214, + "num_input_tokens_seen": 73840655, + "step": 3417, + "time_per_iteration": 3.643225908279419 + }, + { + "auxiliary_loss_clip": 0.01144978, + "auxiliary_loss_mlp": 0.01084928, + "balance_loss_clip": 1.03293514, + "balance_loss_mlp": 1.00448537, + "epoch": 0.4109902002044129, + "flos": 20702753314560.0, + "grad_norm": 1.5405651589805727, + "language_loss": 0.88040864, + "learning_rate": 2.6611357959617277e-06, + "loss": 0.9027077, + "num_input_tokens_seen": 73860275, + "step": 3418, + "time_per_iteration": 2.6930770874023438 + }, + { + "auxiliary_loss_clip": 0.0111562, + "auxiliary_loss_mlp": 0.01086835, + "balance_loss_clip": 1.02914274, + "balance_loss_mlp": 1.00610602, + "epoch": 0.411110443095052, + "flos": 18179992477440.0, + "grad_norm": 1.739544320069025, + "language_loss": 0.91434968, + "learning_rate": 2.660400566248578e-06, + "loss": 0.93637431, + "num_input_tokens_seen": 73878400, + "step": 3419, + "time_per_iteration": 2.7587027549743652 + }, + { + "auxiliary_loss_clip": 0.011139, + "auxiliary_loss_mlp": 0.01086602, + "balance_loss_clip": 1.02814698, + "balance_loss_mlp": 1.00563467, + "epoch": 0.41123068598569107, + "flos": 14575244209920.0, + "grad_norm": 2.092855461010811, + "language_loss": 0.66757518, + "learning_rate": 2.6596652363531876e-06, + "loss": 0.6895802, + "num_input_tokens_seen": 73894275, + "step": 3420, + "time_per_iteration": 2.875162363052368 + }, + { + "auxiliary_loss_clip": 0.01146149, + "auxiliary_loss_mlp": 0.01085166, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.00448549, + "epoch": 0.4113509288763302, + "flos": 21177995184000.0, + "grad_norm": 1.566211203657366, + "language_loss": 0.78277868, + "learning_rate": 2.6589298063871055e-06, + "loss": 0.80509186, + "num_input_tokens_seen": 73914450, + "step": 3421, + "time_per_iteration": 2.6172971725463867 + }, + { + "auxiliary_loss_clip": 0.0114438, + "auxiliary_loss_mlp": 0.01085413, + "balance_loss_clip": 1.03232789, + "balance_loss_mlp": 1.00463676, + "epoch": 0.4114711717669693, + "flos": 18442212739200.0, + "grad_norm": 1.9716566903699093, + "language_loss": 0.69812346, + "learning_rate": 2.658194276461895e-06, + "loss": 0.72042137, + "num_input_tokens_seen": 73932375, + "step": 3422, + "time_per_iteration": 2.6680994033813477 + }, + { + "auxiliary_loss_clip": 0.01127556, + "auxiliary_loss_mlp": 0.01087873, + "balance_loss_clip": 1.03001165, + "balance_loss_mlp": 1.00690579, + "epoch": 0.41159141465760835, + "flos": 27233395735680.0, + "grad_norm": 1.9384767540616004, + "language_loss": 0.66770869, + "learning_rate": 2.6574586466891368e-06, + "loss": 0.68986297, + "num_input_tokens_seen": 73952850, + "step": 3423, + "time_per_iteration": 3.809004306793213 + }, + { + "auxiliary_loss_clip": 0.01125759, + "auxiliary_loss_mlp": 0.00873134, + "balance_loss_clip": 1.03028297, + "balance_loss_mlp": 1.00046301, + "epoch": 0.41171165754824746, + "flos": 20006876154240.0, + "grad_norm": 2.1996854310062672, + "language_loss": 0.64586377, + "learning_rate": 2.6567229171804247e-06, + "loss": 0.66585267, + "num_input_tokens_seen": 73970735, + "step": 3424, + "time_per_iteration": 2.7311480045318604 + }, + { + "auxiliary_loss_clip": 0.011165, + "auxiliary_loss_mlp": 0.01085853, + "balance_loss_clip": 1.02929425, + "balance_loss_mlp": 1.00498128, + "epoch": 0.41183190043888657, + "flos": 18004318035840.0, + "grad_norm": 2.8595339477374386, + "language_loss": 0.87857252, + "learning_rate": 2.655987088047368e-06, + "loss": 0.90059602, + "num_input_tokens_seen": 73989080, + "step": 3425, + "time_per_iteration": 3.768855571746826 + }, + { + "auxiliary_loss_clip": 0.0112726, + "auxiliary_loss_mlp": 0.01086029, + "balance_loss_clip": 1.03031373, + "balance_loss_mlp": 1.00534856, + "epoch": 0.4119521433295256, + "flos": 27163370171520.0, + "grad_norm": 1.9011212975237846, + "language_loss": 0.78697121, + "learning_rate": 2.6552511594015912e-06, + "loss": 0.80910408, + "num_input_tokens_seen": 74009470, + "step": 3426, + "time_per_iteration": 2.824838399887085 + }, + { + "auxiliary_loss_clip": 0.01127479, + "auxiliary_loss_mlp": 0.0108707, + "balance_loss_clip": 1.03030205, + "balance_loss_mlp": 1.00610292, + "epoch": 0.41207238622016473, + "flos": 15122020014720.0, + "grad_norm": 1.8517448821727622, + "language_loss": 0.85097396, + "learning_rate": 2.654515131354735e-06, + "loss": 0.87311947, + "num_input_tokens_seen": 74027735, + "step": 3427, + "time_per_iteration": 2.768237590789795 + }, + { + "auxiliary_loss_clip": 0.01114547, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_clip": 1.02903581, + "balance_loss_mlp": 1.00524414, + "epoch": 0.41219262911080384, + "flos": 27052872958080.0, + "grad_norm": 2.4044837559604386, + "language_loss": 0.84746027, + "learning_rate": 2.653779004018453e-06, + "loss": 0.86946499, + "num_input_tokens_seen": 74048300, + "step": 3428, + "time_per_iteration": 3.7670390605926514 + }, + { + "auxiliary_loss_clip": 0.01127398, + "auxiliary_loss_mlp": 0.01085692, + "balance_loss_clip": 1.03145003, + "balance_loss_mlp": 1.00496352, + "epoch": 0.4123128720014429, + "flos": 24686360282880.0, + "grad_norm": 2.2574705864892097, + "language_loss": 0.82221514, + "learning_rate": 2.653042777504417e-06, + "loss": 0.84434605, + "num_input_tokens_seen": 74070890, + "step": 3429, + "time_per_iteration": 2.738074779510498 + }, + { + "auxiliary_loss_clip": 0.01110556, + "auxiliary_loss_mlp": 0.01085868, + "balance_loss_clip": 1.03004813, + "balance_loss_mlp": 1.00504386, + "epoch": 0.412433114892082, + "flos": 26244774731520.0, + "grad_norm": 1.8204447901167122, + "language_loss": 0.79665804, + "learning_rate": 2.6523064519243105e-06, + "loss": 0.81862223, + "num_input_tokens_seen": 74090460, + "step": 3430, + "time_per_iteration": 2.8206539154052734 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01086511, + "balance_loss_clip": 1.0313307, + "balance_loss_mlp": 1.00559187, + "epoch": 0.4125533577827211, + "flos": 21361031913600.0, + "grad_norm": 2.184732426025695, + "language_loss": 0.79265141, + "learning_rate": 2.6515700273898333e-06, + "loss": 0.81485736, + "num_input_tokens_seen": 74108335, + "step": 3431, + "time_per_iteration": 2.7564995288848877 + }, + { + "auxiliary_loss_clip": 0.01120592, + "auxiliary_loss_mlp": 0.01086159, + "balance_loss_clip": 1.02698076, + "balance_loss_mlp": 1.0055263, + "epoch": 0.4126736006733602, + "flos": 26067556005120.0, + "grad_norm": 1.857823301941503, + "language_loss": 0.68933159, + "learning_rate": 2.6508335040127018e-06, + "loss": 0.71139914, + "num_input_tokens_seen": 74128030, + "step": 3432, + "time_per_iteration": 2.7938482761383057 + }, + { + "auxiliary_loss_clip": 0.01134964, + "auxiliary_loss_mlp": 0.0108536, + "balance_loss_clip": 1.03031707, + "balance_loss_mlp": 1.00458431, + "epoch": 0.4127938435639993, + "flos": 25666146541440.0, + "grad_norm": 1.4305687392791542, + "language_loss": 0.77222514, + "learning_rate": 2.6500968819046446e-06, + "loss": 0.79442835, + "num_input_tokens_seen": 74148330, + "step": 3433, + "time_per_iteration": 2.7724194526672363 + }, + { + "auxiliary_loss_clip": 0.01114583, + "auxiliary_loss_mlp": 0.01085647, + "balance_loss_clip": 1.02757001, + "balance_loss_mlp": 1.00496578, + "epoch": 0.4129140864546384, + "flos": 17995914253440.0, + "grad_norm": 2.228159400003661, + "language_loss": 0.59111089, + "learning_rate": 2.649360161177408e-06, + "loss": 0.61311316, + "num_input_tokens_seen": 74163390, + "step": 3434, + "time_per_iteration": 2.7498230934143066 + }, + { + "auxiliary_loss_clip": 0.01136787, + "auxiliary_loss_mlp": 0.01085804, + "balance_loss_clip": 1.03135455, + "balance_loss_mlp": 1.00498044, + "epoch": 0.41303432934527745, + "flos": 23732895715200.0, + "grad_norm": 1.7066168693549315, + "language_loss": 0.73213834, + "learning_rate": 2.6486233419427504e-06, + "loss": 0.75436425, + "num_input_tokens_seen": 74183205, + "step": 3435, + "time_per_iteration": 2.7542574405670166 + }, + { + "auxiliary_loss_clip": 0.01115861, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_clip": 1.02951956, + "balance_loss_mlp": 1.00543201, + "epoch": 0.41315457223591656, + "flos": 19755286318080.0, + "grad_norm": 1.867922476431562, + "language_loss": 0.75131971, + "learning_rate": 2.6478864243124484e-06, + "loss": 0.77334088, + "num_input_tokens_seen": 74202870, + "step": 3436, + "time_per_iteration": 2.771545171737671 + }, + { + "auxiliary_loss_clip": 0.01135871, + "auxiliary_loss_mlp": 0.01085515, + "balance_loss_clip": 1.03102076, + "balance_loss_mlp": 1.00478649, + "epoch": 0.4132748151265556, + "flos": 20923316778240.0, + "grad_norm": 1.6397192816002577, + "language_loss": 0.8508935, + "learning_rate": 2.6471494083982903e-06, + "loss": 0.87310731, + "num_input_tokens_seen": 74222255, + "step": 3437, + "time_per_iteration": 2.777855157852173 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01086129, + "balance_loss_clip": 1.03040743, + "balance_loss_mlp": 1.00540054, + "epoch": 0.4133950580171947, + "flos": 32232520016640.0, + "grad_norm": 2.0624314761624603, + "language_loss": 0.75131404, + "learning_rate": 2.6464122943120818e-06, + "loss": 0.77334905, + "num_input_tokens_seen": 74242480, + "step": 3438, + "time_per_iteration": 2.8439085483551025 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.01086034, + "balance_loss_clip": 1.02881789, + "balance_loss_mlp": 1.00530553, + "epoch": 0.41351530090783384, + "flos": 23292487059840.0, + "grad_norm": 3.2395588742137917, + "language_loss": 0.82398558, + "learning_rate": 2.645675082165642e-06, + "loss": 0.84598011, + "num_input_tokens_seen": 74258690, + "step": 3439, + "time_per_iteration": 2.7878828048706055 + }, + { + "auxiliary_loss_clip": 0.01122206, + "auxiliary_loss_mlp": 0.01083533, + "balance_loss_clip": 1.02798355, + "balance_loss_mlp": 1.00290036, + "epoch": 0.4136355437984729, + "flos": 25593571111680.0, + "grad_norm": 2.1207084643466105, + "language_loss": 0.7524879, + "learning_rate": 2.644937772070806e-06, + "loss": 0.77454531, + "num_input_tokens_seen": 74277135, + "step": 3440, + "time_per_iteration": 2.7609705924987793 + }, + { + "auxiliary_loss_clip": 0.01145016, + "auxiliary_loss_mlp": 0.01084388, + "balance_loss_clip": 1.03223252, + "balance_loss_mlp": 1.00380278, + "epoch": 0.413755786689112, + "flos": 19828615933440.0, + "grad_norm": 2.2381585199694634, + "language_loss": 0.83559251, + "learning_rate": 2.6442003641394225e-06, + "loss": 0.85788655, + "num_input_tokens_seen": 74294730, + "step": 3441, + "time_per_iteration": 2.6308727264404297 + }, + { + "auxiliary_loss_clip": 0.01127225, + "auxiliary_loss_mlp": 0.01085827, + "balance_loss_clip": 1.02995062, + "balance_loss_mlp": 1.00514615, + "epoch": 0.4138760295797511, + "flos": 26870446759680.0, + "grad_norm": 1.530419669013532, + "language_loss": 0.83795869, + "learning_rate": 2.643462858483356e-06, + "loss": 0.86008924, + "num_input_tokens_seen": 74315015, + "step": 3442, + "time_per_iteration": 3.739232063293457 + }, + { + "auxiliary_loss_clip": 0.0110151, + "auxiliary_loss_mlp": 0.01086896, + "balance_loss_clip": 1.02510273, + "balance_loss_mlp": 1.00611949, + "epoch": 0.41399627247039017, + "flos": 16399254798720.0, + "grad_norm": 1.9936407427967022, + "language_loss": 0.72484171, + "learning_rate": 2.6427252552144856e-06, + "loss": 0.7467258, + "num_input_tokens_seen": 74333665, + "step": 3443, + "time_per_iteration": 2.7872745990753174 + }, + { + "auxiliary_loss_clip": 0.01144154, + "auxiliary_loss_mlp": 0.01084888, + "balance_loss_clip": 1.0317347, + "balance_loss_mlp": 1.0040642, + "epoch": 0.4141165153610293, + "flos": 22930220442240.0, + "grad_norm": 1.7983369538396374, + "language_loss": 0.74970984, + "learning_rate": 2.6419875544447044e-06, + "loss": 0.77200019, + "num_input_tokens_seen": 74355065, + "step": 3444, + "time_per_iteration": 2.7697346210479736 + }, + { + "auxiliary_loss_clip": 0.01144321, + "auxiliary_loss_mlp": 0.01085766, + "balance_loss_clip": 1.03104901, + "balance_loss_mlp": 1.00498939, + "epoch": 0.4142367582516684, + "flos": 25192556697600.0, + "grad_norm": 1.637161135806475, + "language_loss": 0.71585596, + "learning_rate": 2.6412497562859218e-06, + "loss": 0.73815691, + "num_input_tokens_seen": 74376345, + "step": 3445, + "time_per_iteration": 2.742438316345215 + }, + { + "auxiliary_loss_clip": 0.01136695, + "auxiliary_loss_mlp": 0.01085714, + "balance_loss_clip": 1.03133869, + "balance_loss_mlp": 1.00488973, + "epoch": 0.41435700114230745, + "flos": 21690476478720.0, + "grad_norm": 2.776017759549107, + "language_loss": 0.76283771, + "learning_rate": 2.6405118608500617e-06, + "loss": 0.78506178, + "num_input_tokens_seen": 74395170, + "step": 3446, + "time_per_iteration": 2.757286787033081 + }, + { + "auxiliary_loss_clip": 0.01111834, + "auxiliary_loss_mlp": 0.01084903, + "balance_loss_clip": 1.028409, + "balance_loss_mlp": 1.00426972, + "epoch": 0.41447724403294656, + "flos": 25995160143360.0, + "grad_norm": 1.7248207503307584, + "language_loss": 0.81245375, + "learning_rate": 2.6397738682490613e-06, + "loss": 0.8344211, + "num_input_tokens_seen": 74416070, + "step": 3447, + "time_per_iteration": 2.8737711906433105 + }, + { + "auxiliary_loss_clip": 0.01144844, + "auxiliary_loss_mlp": 0.01085671, + "balance_loss_clip": 1.03188801, + "balance_loss_mlp": 1.00498986, + "epoch": 0.41459748692358567, + "flos": 18259678800000.0, + "grad_norm": 1.909524082656789, + "language_loss": 0.75319457, + "learning_rate": 2.6390357785948734e-06, + "loss": 0.7754997, + "num_input_tokens_seen": 74433185, + "step": 3448, + "time_per_iteration": 2.640994071960449 + }, + { + "auxiliary_loss_clip": 0.01134223, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_clip": 1.03086126, + "balance_loss_mlp": 1.00347614, + "epoch": 0.4147177298142247, + "flos": 24168456034560.0, + "grad_norm": 1.7857569160348352, + "language_loss": 0.80156362, + "learning_rate": 2.6382975919994667e-06, + "loss": 0.82374895, + "num_input_tokens_seen": 74453760, + "step": 3449, + "time_per_iteration": 3.7101891040802 + }, + { + "auxiliary_loss_clip": 0.01126427, + "auxiliary_loss_mlp": 0.01084644, + "balance_loss_clip": 1.03035879, + "balance_loss_mlp": 1.00410652, + "epoch": 0.41483797270486383, + "flos": 20084659056000.0, + "grad_norm": 2.463493620573062, + "language_loss": 0.7314679, + "learning_rate": 2.637559308574822e-06, + "loss": 0.75357866, + "num_input_tokens_seen": 74473505, + "step": 3450, + "time_per_iteration": 2.7653772830963135 + }, + { + "auxiliary_loss_clip": 0.01144752, + "auxiliary_loss_mlp": 0.01084971, + "balance_loss_clip": 1.03191161, + "balance_loss_mlp": 1.00428987, + "epoch": 0.4149582155955029, + "flos": 30081040110720.0, + "grad_norm": 1.9627068030450419, + "language_loss": 0.71349418, + "learning_rate": 2.6368209284329376e-06, + "loss": 0.73579139, + "num_input_tokens_seen": 74494135, + "step": 3451, + "time_per_iteration": 3.753335475921631 + }, + { + "auxiliary_loss_clip": 0.01136087, + "auxiliary_loss_mlp": 0.01085172, + "balance_loss_clip": 1.03161287, + "balance_loss_mlp": 1.00434852, + "epoch": 0.415078458486142, + "flos": 16764394504320.0, + "grad_norm": 1.74054478479153, + "language_loss": 0.75557238, + "learning_rate": 2.636082451685825e-06, + "loss": 0.77778494, + "num_input_tokens_seen": 74512335, + "step": 3452, + "time_per_iteration": 2.6958365440368652 + }, + { + "auxiliary_loss_clip": 0.0112632, + "auxiliary_loss_mlp": 0.01087513, + "balance_loss_clip": 1.03078175, + "balance_loss_mlp": 1.00673652, + "epoch": 0.4151987013767811, + "flos": 26033692458240.0, + "grad_norm": 2.2087024574525747, + "language_loss": 0.86364102, + "learning_rate": 2.6353438784455094e-06, + "loss": 0.88577938, + "num_input_tokens_seen": 74535620, + "step": 3453, + "time_per_iteration": 2.9033472537994385 + }, + { + "auxiliary_loss_clip": 0.01126579, + "auxiliary_loss_mlp": 0.01086228, + "balance_loss_clip": 1.03128195, + "balance_loss_mlp": 1.00540435, + "epoch": 0.41531894426742016, + "flos": 24608002763520.0, + "grad_norm": 2.223629917988934, + "language_loss": 0.71567303, + "learning_rate": 2.6346052088240326e-06, + "loss": 0.73780107, + "num_input_tokens_seen": 74555140, + "step": 3454, + "time_per_iteration": 3.688016891479492 + }, + { + "auxiliary_loss_clip": 0.01124963, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_clip": 1.03003144, + "balance_loss_mlp": 1.00530684, + "epoch": 0.4154391871580593, + "flos": 14975791747200.0, + "grad_norm": 1.7524311251312303, + "language_loss": 0.77581751, + "learning_rate": 2.63386644293345e-06, + "loss": 0.79792845, + "num_input_tokens_seen": 74571485, + "step": 3455, + "time_per_iteration": 2.7928547859191895 + }, + { + "auxiliary_loss_clip": 0.01120089, + "auxiliary_loss_mlp": 0.01086276, + "balance_loss_clip": 1.03139162, + "balance_loss_mlp": 1.00564337, + "epoch": 0.4155594300486984, + "flos": 14647173194880.0, + "grad_norm": 2.258755595381477, + "language_loss": 0.83197218, + "learning_rate": 2.633127580885833e-06, + "loss": 0.85403585, + "num_input_tokens_seen": 74585985, + "step": 3456, + "time_per_iteration": 2.758185863494873 + }, + { + "auxiliary_loss_clip": 0.01144894, + "auxiliary_loss_mlp": 0.01085683, + "balance_loss_clip": 1.03243577, + "balance_loss_mlp": 1.00490677, + "epoch": 0.41567967293933744, + "flos": 29497276275840.0, + "grad_norm": 2.056699258924477, + "language_loss": 0.65035975, + "learning_rate": 2.632388622793265e-06, + "loss": 0.67266554, + "num_input_tokens_seen": 74605140, + "step": 3457, + "time_per_iteration": 2.7153432369232178 + }, + { + "auxiliary_loss_clip": 0.01138225, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_clip": 1.03274798, + "balance_loss_mlp": 1.0053308, + "epoch": 0.41579991582997655, + "flos": 19238387650560.0, + "grad_norm": 1.8907032798917847, + "language_loss": 0.67827147, + "learning_rate": 2.6316495687678457e-06, + "loss": 0.70051479, + "num_input_tokens_seen": 74623790, + "step": 3458, + "time_per_iteration": 2.6866726875305176 + }, + { + "auxiliary_loss_clip": 0.01105794, + "auxiliary_loss_mlp": 0.0108492, + "balance_loss_clip": 1.02792966, + "balance_loss_mlp": 1.00423908, + "epoch": 0.41592015872061566, + "flos": 24462061804800.0, + "grad_norm": 2.2881767101641457, + "language_loss": 0.76200795, + "learning_rate": 2.6309104189216887e-06, + "loss": 0.7839151, + "num_input_tokens_seen": 74641355, + "step": 3459, + "time_per_iteration": 2.8591582775115967 + }, + { + "auxiliary_loss_clip": 0.01115333, + "auxiliary_loss_mlp": 0.00873409, + "balance_loss_clip": 1.02918243, + "balance_loss_mlp": 1.00045085, + "epoch": 0.4160404016112547, + "flos": 20775651966720.0, + "grad_norm": 2.9535093851826697, + "language_loss": 0.74703658, + "learning_rate": 2.630171173366923e-06, + "loss": 0.76692396, + "num_input_tokens_seen": 74657155, + "step": 3460, + "time_per_iteration": 2.7872235774993896 + }, + { + "auxiliary_loss_clip": 0.01108964, + "auxiliary_loss_mlp": 0.01086791, + "balance_loss_clip": 1.02986825, + "balance_loss_mlp": 1.00591946, + "epoch": 0.41616064450189383, + "flos": 13916462820480.0, + "grad_norm": 3.549866947186479, + "language_loss": 0.74150717, + "learning_rate": 2.629431832215691e-06, + "loss": 0.76346475, + "num_input_tokens_seen": 74671960, + "step": 3461, + "time_per_iteration": 2.824216365814209 + }, + { + "auxiliary_loss_clip": 0.01123045, + "auxiliary_loss_mlp": 0.01085143, + "balance_loss_clip": 1.02895331, + "balance_loss_mlp": 1.00436711, + "epoch": 0.41628088739253294, + "flos": 20010826650240.0, + "grad_norm": 1.594607471142716, + "language_loss": 0.87091887, + "learning_rate": 2.628692395580151e-06, + "loss": 0.89300072, + "num_input_tokens_seen": 74692050, + "step": 3462, + "time_per_iteration": 2.7032885551452637 + }, + { + "auxiliary_loss_clip": 0.0109816, + "auxiliary_loss_mlp": 0.01085586, + "balance_loss_clip": 1.02775133, + "balance_loss_mlp": 1.00485778, + "epoch": 0.416401130283172, + "flos": 29168801377920.0, + "grad_norm": 1.6547880607103358, + "language_loss": 0.79405183, + "learning_rate": 2.6279528635724747e-06, + "loss": 0.81588924, + "num_input_tokens_seen": 74712205, + "step": 3463, + "time_per_iteration": 2.8834333419799805 + }, + { + "auxiliary_loss_clip": 0.01138025, + "auxiliary_loss_mlp": 0.01086604, + "balance_loss_clip": 1.03284895, + "balance_loss_mlp": 1.00563741, + "epoch": 0.4165213731738111, + "flos": 16246813478400.0, + "grad_norm": 2.8059597941049965, + "language_loss": 0.78697157, + "learning_rate": 2.627213236304848e-06, + "loss": 0.80921793, + "num_input_tokens_seen": 74729005, + "step": 3464, + "time_per_iteration": 2.7301130294799805 + }, + { + "auxiliary_loss_clip": 0.01136169, + "auxiliary_loss_mlp": 0.01085425, + "balance_loss_clip": 1.03114355, + "balance_loss_mlp": 1.00469613, + "epoch": 0.4166416160644502, + "flos": 33765438787200.0, + "grad_norm": 1.727407576792159, + "language_loss": 0.7060045, + "learning_rate": 2.626473513889472e-06, + "loss": 0.72822046, + "num_input_tokens_seen": 74751385, + "step": 3465, + "time_per_iteration": 2.794200897216797 + }, + { + "auxiliary_loss_clip": 0.01136684, + "auxiliary_loss_mlp": 0.01085832, + "balance_loss_clip": 1.03193235, + "balance_loss_mlp": 1.00515079, + "epoch": 0.41676185895508927, + "flos": 20917498775040.0, + "grad_norm": 1.7615287178291668, + "language_loss": 0.829642, + "learning_rate": 2.625733696438562e-06, + "loss": 0.85186714, + "num_input_tokens_seen": 74768890, + "step": 3466, + "time_per_iteration": 2.755178451538086 + }, + { + "auxiliary_loss_clip": 0.01127194, + "auxiliary_loss_mlp": 0.01087465, + "balance_loss_clip": 1.03066003, + "balance_loss_mlp": 1.00664091, + "epoch": 0.4168821018457284, + "flos": 18406122549120.0, + "grad_norm": 1.681217935482829, + "language_loss": 0.74994415, + "learning_rate": 2.6249937840643476e-06, + "loss": 0.77209073, + "num_input_tokens_seen": 74787195, + "step": 3467, + "time_per_iteration": 3.6480894088745117 + }, + { + "auxiliary_loss_clip": 0.01146694, + "auxiliary_loss_mlp": 0.00873377, + "balance_loss_clip": 1.03408408, + "balance_loss_mlp": 1.00052834, + "epoch": 0.41700234473636744, + "flos": 18698399516160.0, + "grad_norm": 1.6654737300943534, + "language_loss": 0.66816032, + "learning_rate": 2.6242537768790733e-06, + "loss": 0.68836105, + "num_input_tokens_seen": 74806350, + "step": 3468, + "time_per_iteration": 2.673354148864746 + }, + { + "auxiliary_loss_clip": 0.01134442, + "auxiliary_loss_mlp": 0.01087122, + "balance_loss_clip": 1.03036296, + "balance_loss_mlp": 1.00615478, + "epoch": 0.41712258762700655, + "flos": 31033283616000.0, + "grad_norm": 1.7247215482497704, + "language_loss": 0.69015187, + "learning_rate": 2.6235136749949975e-06, + "loss": 0.71236753, + "num_input_tokens_seen": 74829800, + "step": 3469, + "time_per_iteration": 2.731410503387451 + }, + { + "auxiliary_loss_clip": 0.01145437, + "auxiliary_loss_mlp": 0.0108437, + "balance_loss_clip": 1.03264904, + "balance_loss_mlp": 1.00359344, + "epoch": 0.41724283051764566, + "flos": 35914763877120.0, + "grad_norm": 1.9362902708791065, + "language_loss": 0.61478335, + "learning_rate": 2.6227734785243924e-06, + "loss": 0.63708138, + "num_input_tokens_seen": 74849760, + "step": 3470, + "time_per_iteration": 2.8918752670288086 + }, + { + "auxiliary_loss_clip": 0.01095491, + "auxiliary_loss_mlp": 0.01085934, + "balance_loss_clip": 1.0262413, + "balance_loss_mlp": 1.00520539, + "epoch": 0.4173630734082847, + "flos": 25333649320320.0, + "grad_norm": 2.2198336211619445, + "language_loss": 0.79228532, + "learning_rate": 2.6220331875795466e-06, + "loss": 0.81409955, + "num_input_tokens_seen": 74869110, + "step": 3471, + "time_per_iteration": 2.885037899017334 + }, + { + "auxiliary_loss_clip": 0.01138232, + "auxiliary_loss_mlp": 0.01084518, + "balance_loss_clip": 1.03304267, + "balance_loss_mlp": 1.00374222, + "epoch": 0.4174833162989238, + "flos": 26685398868480.0, + "grad_norm": 1.716164146606548, + "language_loss": 0.7529977, + "learning_rate": 2.62129280227276e-06, + "loss": 0.77522516, + "num_input_tokens_seen": 74889110, + "step": 3472, + "time_per_iteration": 2.6881353855133057 + }, + { + "auxiliary_loss_clip": 0.01137941, + "auxiliary_loss_mlp": 0.01086841, + "balance_loss_clip": 1.03278208, + "balance_loss_mlp": 1.00596964, + "epoch": 0.41760355918956293, + "flos": 74739584010240.0, + "grad_norm": 1.8499085349923619, + "language_loss": 0.68562621, + "learning_rate": 2.62055232271635e-06, + "loss": 0.70787406, + "num_input_tokens_seen": 74916260, + "step": 3473, + "time_per_iteration": 3.1483490467071533 + }, + { + "auxiliary_loss_clip": 0.0112048, + "auxiliary_loss_mlp": 0.01086229, + "balance_loss_clip": 1.03200197, + "balance_loss_mlp": 1.00540495, + "epoch": 0.417723802080202, + "flos": 14317513148160.0, + "grad_norm": 1.9279672593825874, + "language_loss": 0.87487316, + "learning_rate": 2.619811749022646e-06, + "loss": 0.89694023, + "num_input_tokens_seen": 74931570, + "step": 3474, + "time_per_iteration": 3.856025457382202 + }, + { + "auxiliary_loss_clip": 0.01135759, + "auxiliary_loss_mlp": 0.01086862, + "balance_loss_clip": 1.03211737, + "balance_loss_mlp": 1.00589514, + "epoch": 0.4178440449708411, + "flos": 14643797316480.0, + "grad_norm": 6.208524333540945, + "language_loss": 0.71386743, + "learning_rate": 2.6190710813039917e-06, + "loss": 0.73609364, + "num_input_tokens_seen": 74944695, + "step": 3475, + "time_per_iteration": 2.6646721363067627 + }, + { + "auxiliary_loss_clip": 0.01101168, + "auxiliary_loss_mlp": 0.00873592, + "balance_loss_clip": 1.02722692, + "balance_loss_mlp": 1.00049174, + "epoch": 0.4179642878614802, + "flos": 21507296094720.0, + "grad_norm": 2.9346386342477184, + "language_loss": 0.83925295, + "learning_rate": 2.618330319672747e-06, + "loss": 0.85900056, + "num_input_tokens_seen": 74964115, + "step": 3476, + "time_per_iteration": 3.8331081867218018 + }, + { + "auxiliary_loss_clip": 0.01146071, + "auxiliary_loss_mlp": 0.01084911, + "balance_loss_clip": 1.03272009, + "balance_loss_mlp": 1.0041821, + "epoch": 0.41808453075211927, + "flos": 18441997257600.0, + "grad_norm": 2.452462711357869, + "language_loss": 0.92276788, + "learning_rate": 2.617589464241284e-06, + "loss": 0.94507772, + "num_input_tokens_seen": 74978515, + "step": 3477, + "time_per_iteration": 2.662808895111084 + }, + { + "auxiliary_loss_clip": 0.01100838, + "auxiliary_loss_mlp": 0.0108706, + "balance_loss_clip": 1.02955616, + "balance_loss_mlp": 1.00647449, + "epoch": 0.4182047736427584, + "flos": 20301020628480.0, + "grad_norm": 1.9752417840639356, + "language_loss": 0.74532568, + "learning_rate": 2.6168485151219914e-06, + "loss": 0.76720464, + "num_input_tokens_seen": 74998135, + "step": 3478, + "time_per_iteration": 2.812809705734253 + }, + { + "auxiliary_loss_clip": 0.01134852, + "auxiliary_loss_mlp": 0.01085222, + "balance_loss_clip": 1.03116536, + "balance_loss_mlp": 1.00449324, + "epoch": 0.4183250165333975, + "flos": 18876623823360.0, + "grad_norm": 2.0979455950038184, + "language_loss": 0.71414709, + "learning_rate": 2.616107472427269e-06, + "loss": 0.73634779, + "num_input_tokens_seen": 75012830, + "step": 3479, + "time_per_iteration": 2.677825450897217 + }, + { + "auxiliary_loss_clip": 0.0113651, + "auxiliary_loss_mlp": 0.010851, + "balance_loss_clip": 1.03117633, + "balance_loss_mlp": 1.00418067, + "epoch": 0.41844525942403654, + "flos": 17740050698880.0, + "grad_norm": 2.49069787850026, + "language_loss": 0.76296645, + "learning_rate": 2.615366336269533e-06, + "loss": 0.78518254, + "num_input_tokens_seen": 75026495, + "step": 3480, + "time_per_iteration": 3.5849130153656006 + }, + { + "auxiliary_loss_clip": 0.01144903, + "auxiliary_loss_mlp": 0.01087708, + "balance_loss_clip": 1.0320394, + "balance_loss_mlp": 1.00678873, + "epoch": 0.41856550231467565, + "flos": 18361377181440.0, + "grad_norm": 2.204335955541526, + "language_loss": 0.80132174, + "learning_rate": 2.6146251067612126e-06, + "loss": 0.82364786, + "num_input_tokens_seen": 75041970, + "step": 3481, + "time_per_iteration": 2.627476692199707 + }, + { + "auxiliary_loss_clip": 0.01131213, + "auxiliary_loss_mlp": 0.01086164, + "balance_loss_clip": 1.02925849, + "balance_loss_mlp": 1.0053879, + "epoch": 0.41868574520531476, + "flos": 22781801445120.0, + "grad_norm": 1.5223177753326906, + "language_loss": 0.82753301, + "learning_rate": 2.6138837840147525e-06, + "loss": 0.84970677, + "num_input_tokens_seen": 75061005, + "step": 3482, + "time_per_iteration": 2.6831719875335693 + }, + { + "auxiliary_loss_clip": 0.01115566, + "auxiliary_loss_mlp": 0.01086996, + "balance_loss_clip": 1.02911806, + "balance_loss_mlp": 1.00621998, + "epoch": 0.4188059880959538, + "flos": 13699167494400.0, + "grad_norm": 2.0209390418506934, + "language_loss": 0.76129103, + "learning_rate": 2.6131423681426103e-06, + "loss": 0.78331661, + "num_input_tokens_seen": 75076920, + "step": 3483, + "time_per_iteration": 2.7630808353424072 + }, + { + "auxiliary_loss_clip": 0.01145643, + "auxiliary_loss_mlp": 0.01086789, + "balance_loss_clip": 1.03295207, + "balance_loss_mlp": 1.00610805, + "epoch": 0.41892623098659293, + "flos": 37818281220480.0, + "grad_norm": 1.66891900538285, + "language_loss": 0.73044884, + "learning_rate": 2.6124008592572587e-06, + "loss": 0.75277317, + "num_input_tokens_seen": 75100905, + "step": 3484, + "time_per_iteration": 2.831474542617798 + }, + { + "auxiliary_loss_clip": 0.01144462, + "auxiliary_loss_mlp": 0.01085003, + "balance_loss_clip": 1.03110504, + "balance_loss_mlp": 1.00413156, + "epoch": 0.419046473877232, + "flos": 23258874908160.0, + "grad_norm": 3.073206915155234, + "language_loss": 0.81736398, + "learning_rate": 2.6116592574711835e-06, + "loss": 0.83965862, + "num_input_tokens_seen": 75119205, + "step": 3485, + "time_per_iteration": 2.7048990726470947 + }, + { + "auxiliary_loss_clip": 0.01147198, + "auxiliary_loss_mlp": 0.01085719, + "balance_loss_clip": 1.03385127, + "balance_loss_mlp": 1.0048002, + "epoch": 0.4191667167678711, + "flos": 20741034234240.0, + "grad_norm": 1.8945890899793583, + "language_loss": 0.84254235, + "learning_rate": 2.6109175628968853e-06, + "loss": 0.86487144, + "num_input_tokens_seen": 75138970, + "step": 3486, + "time_per_iteration": 2.7001733779907227 + }, + { + "auxiliary_loss_clip": 0.01136556, + "auxiliary_loss_mlp": 0.01085723, + "balance_loss_clip": 1.03231859, + "balance_loss_mlp": 1.00504184, + "epoch": 0.4192869596585102, + "flos": 23586416052480.0, + "grad_norm": 1.8374850581469893, + "language_loss": 0.82710636, + "learning_rate": 2.610175775646878e-06, + "loss": 0.84932911, + "num_input_tokens_seen": 75157550, + "step": 3487, + "time_per_iteration": 2.735783815383911 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01084923, + "balance_loss_clip": 1.03055382, + "balance_loss_mlp": 1.00414634, + "epoch": 0.41940720254914926, + "flos": 25081269384960.0, + "grad_norm": 1.811510733272511, + "language_loss": 0.73095572, + "learning_rate": 2.6094338958336907e-06, + "loss": 0.75306755, + "num_input_tokens_seen": 75176220, + "step": 3488, + "time_per_iteration": 2.7573156356811523 + }, + { + "auxiliary_loss_clip": 0.01125422, + "auxiliary_loss_mlp": 0.010858, + "balance_loss_clip": 1.03096986, + "balance_loss_mlp": 1.00516653, + "epoch": 0.41952744543978837, + "flos": 15554132628480.0, + "grad_norm": 2.222955691772628, + "language_loss": 0.82321739, + "learning_rate": 2.608691923569867e-06, + "loss": 0.84532964, + "num_input_tokens_seen": 75193095, + "step": 3489, + "time_per_iteration": 2.7277157306671143 + }, + { + "auxiliary_loss_clip": 0.01137562, + "auxiliary_loss_mlp": 0.01087234, + "balance_loss_clip": 1.03302765, + "balance_loss_mlp": 1.0063622, + "epoch": 0.4196476883304275, + "flos": 24644775312000.0, + "grad_norm": 1.5957770326411989, + "language_loss": 0.76073301, + "learning_rate": 2.6079498589679616e-06, + "loss": 0.78298098, + "num_input_tokens_seen": 75214185, + "step": 3490, + "time_per_iteration": 2.7152819633483887 + }, + { + "auxiliary_loss_clip": 0.0109916, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_clip": 1.02827621, + "balance_loss_mlp": 1.00550508, + "epoch": 0.41976793122106654, + "flos": 24531333183360.0, + "grad_norm": 1.856231638643446, + "language_loss": 0.76117224, + "learning_rate": 2.6072077021405465e-06, + "loss": 0.78302854, + "num_input_tokens_seen": 75233020, + "step": 3491, + "time_per_iteration": 2.882774591445923 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_clip": 1.03096986, + "balance_loss_mlp": 1.00505781, + "epoch": 0.41988817411170565, + "flos": 21175301664000.0, + "grad_norm": 1.6340389318578592, + "language_loss": 0.69064593, + "learning_rate": 2.6064654532002054e-06, + "loss": 0.71269679, + "num_input_tokens_seen": 75252030, + "step": 3492, + "time_per_iteration": 2.8568923473358154 + }, + { + "auxiliary_loss_clip": 0.01145482, + "auxiliary_loss_mlp": 0.01086727, + "balance_loss_clip": 1.03237152, + "balance_loss_mlp": 1.0060457, + "epoch": 0.42000841700234476, + "flos": 31649402626560.0, + "grad_norm": 1.4344321857873759, + "language_loss": 0.75814933, + "learning_rate": 2.6057231122595375e-06, + "loss": 0.78047144, + "num_input_tokens_seen": 75273340, + "step": 3493, + "time_per_iteration": 3.7060930728912354 + }, + { + "auxiliary_loss_clip": 0.01125995, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_clip": 1.03046405, + "balance_loss_mlp": 1.00491536, + "epoch": 0.4201286598929838, + "flos": 21281525159040.0, + "grad_norm": 1.6342300673460592, + "language_loss": 0.73166132, + "learning_rate": 2.604980679431154e-06, + "loss": 0.7537787, + "num_input_tokens_seen": 75291580, + "step": 3494, + "time_per_iteration": 2.735081672668457 + }, + { + "auxiliary_loss_clip": 0.01136943, + "auxiliary_loss_mlp": 0.01085915, + "balance_loss_clip": 1.03166699, + "balance_loss_mlp": 1.005234, + "epoch": 0.4202489027836229, + "flos": 18546532813440.0, + "grad_norm": 2.207345569989718, + "language_loss": 0.74489272, + "learning_rate": 2.604238154827684e-06, + "loss": 0.76712132, + "num_input_tokens_seen": 75308205, + "step": 3495, + "time_per_iteration": 2.6928186416625977 + }, + { + "auxiliary_loss_clip": 0.01138246, + "auxiliary_loss_mlp": 0.01084651, + "balance_loss_clip": 1.03351974, + "balance_loss_mlp": 1.00397062, + "epoch": 0.42036914567426203, + "flos": 19317643009920.0, + "grad_norm": 3.0968477414605586, + "language_loss": 0.72206795, + "learning_rate": 2.6034955385617656e-06, + "loss": 0.74429691, + "num_input_tokens_seen": 75326535, + "step": 3496, + "time_per_iteration": 2.6899073123931885 + }, + { + "auxiliary_loss_clip": 0.0111248, + "auxiliary_loss_mlp": 0.01079648, + "balance_loss_clip": 1.03923345, + "balance_loss_mlp": 1.0003022, + "epoch": 0.4204893885649011, + "flos": 67842942935040.0, + "grad_norm": 0.7218751276333377, + "language_loss": 0.6165005, + "learning_rate": 2.6027528307460544e-06, + "loss": 0.63842177, + "num_input_tokens_seen": 75390540, + "step": 3497, + "time_per_iteration": 3.4059653282165527 + }, + { + "auxiliary_loss_clip": 0.01146347, + "auxiliary_loss_mlp": 0.01085431, + "balance_loss_clip": 1.03318477, + "balance_loss_mlp": 1.00460696, + "epoch": 0.4206096314555402, + "flos": 21908777385600.0, + "grad_norm": 1.7264790796474891, + "language_loss": 0.86516547, + "learning_rate": 2.602010031493217e-06, + "loss": 0.88748318, + "num_input_tokens_seen": 75408770, + "step": 3498, + "time_per_iteration": 2.6712703704833984 + }, + { + "auxiliary_loss_clip": 0.01109129, + "auxiliary_loss_mlp": 0.01086074, + "balance_loss_clip": 1.02512383, + "balance_loss_mlp": 1.00534511, + "epoch": 0.42072987434617926, + "flos": 29278185269760.0, + "grad_norm": 1.6845913645198027, + "language_loss": 0.8643043, + "learning_rate": 2.6012671409159367e-06, + "loss": 0.88625628, + "num_input_tokens_seen": 75430105, + "step": 3499, + "time_per_iteration": 3.759227991104126 + }, + { + "auxiliary_loss_clip": 0.01125481, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.03111482, + "balance_loss_mlp": 1.00354493, + "epoch": 0.42085011723681837, + "flos": 27600726170880.0, + "grad_norm": 1.7582024494746589, + "language_loss": 0.81631255, + "learning_rate": 2.6005241591269097e-06, + "loss": 0.83841014, + "num_input_tokens_seen": 75449475, + "step": 3500, + "time_per_iteration": 2.7988193035125732 + }, + { + "auxiliary_loss_clip": 0.01108033, + "auxiliary_loss_mlp": 0.01085192, + "balance_loss_clip": 1.02794456, + "balance_loss_mlp": 1.0045588, + "epoch": 0.4209703601274575, + "flos": 27818632028160.0, + "grad_norm": 1.6597273624014788, + "language_loss": 0.79774559, + "learning_rate": 2.5997810862388454e-06, + "loss": 0.81967789, + "num_input_tokens_seen": 75469315, + "step": 3501, + "time_per_iteration": 2.7998368740081787 + }, + { + "auxiliary_loss_clip": 0.01126705, + "auxiliary_loss_mlp": 0.01086899, + "balance_loss_clip": 1.0306263, + "balance_loss_mlp": 1.0060277, + "epoch": 0.42109060301809653, + "flos": 27525529048320.0, + "grad_norm": 1.8724465321379065, + "language_loss": 0.75567603, + "learning_rate": 2.599037922364467e-06, + "loss": 0.777812, + "num_input_tokens_seen": 75488215, + "step": 3502, + "time_per_iteration": 3.740635395050049 + }, + { + "auxiliary_loss_clip": 0.01107603, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_clip": 1.02465534, + "balance_loss_mlp": 1.00419974, + "epoch": 0.42121084590873564, + "flos": 29314275459840.0, + "grad_norm": 1.9072130488926617, + "language_loss": 0.74856412, + "learning_rate": 2.5982946676165112e-06, + "loss": 0.77048898, + "num_input_tokens_seen": 75507985, + "step": 3503, + "time_per_iteration": 2.8712923526763916 + }, + { + "auxiliary_loss_clip": 0.01105715, + "auxiliary_loss_mlp": 0.01080173, + "balance_loss_clip": 1.03314769, + "balance_loss_mlp": 1.00082731, + "epoch": 0.42133108879937475, + "flos": 67398835178880.0, + "grad_norm": 0.7277439089558815, + "language_loss": 0.57642734, + "learning_rate": 2.5975513221077313e-06, + "loss": 0.59828627, + "num_input_tokens_seen": 75571955, + "step": 3504, + "time_per_iteration": 4.34759521484375 + }, + { + "auxiliary_loss_clip": 0.01127486, + "auxiliary_loss_mlp": 0.0108605, + "balance_loss_clip": 1.03199172, + "balance_loss_mlp": 1.00536895, + "epoch": 0.4214513316900138, + "flos": 23106038538240.0, + "grad_norm": 2.288932572658597, + "language_loss": 0.88405263, + "learning_rate": 2.5968078859508897e-06, + "loss": 0.90618801, + "num_input_tokens_seen": 75589155, + "step": 3505, + "time_per_iteration": 2.7959868907928467 + }, + { + "auxiliary_loss_clip": 0.01134101, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.03042722, + "balance_loss_mlp": 1.00480521, + "epoch": 0.4215715745806529, + "flos": 15336190857600.0, + "grad_norm": 1.9548394257336945, + "language_loss": 0.79853851, + "learning_rate": 2.5960643592587673e-06, + "loss": 0.82073438, + "num_input_tokens_seen": 75606565, + "step": 3506, + "time_per_iteration": 2.7344682216644287 + }, + { + "auxiliary_loss_clip": 0.01115455, + "auxiliary_loss_mlp": 0.01085474, + "balance_loss_clip": 1.02889049, + "balance_loss_mlp": 1.00474501, + "epoch": 0.42169181747129203, + "flos": 22127257860480.0, + "grad_norm": 1.922265712839127, + "language_loss": 0.81615245, + "learning_rate": 2.5953207421441553e-06, + "loss": 0.83816177, + "num_input_tokens_seen": 75625165, + "step": 3507, + "time_per_iteration": 2.7772862911224365 + }, + { + "auxiliary_loss_clip": 0.01113744, + "auxiliary_loss_mlp": 0.01085703, + "balance_loss_clip": 1.02836204, + "balance_loss_mlp": 1.0049746, + "epoch": 0.4218120603619311, + "flos": 22630724841600.0, + "grad_norm": 2.4424881056393337, + "language_loss": 0.75215399, + "learning_rate": 2.5945770347198603e-06, + "loss": 0.7741484, + "num_input_tokens_seen": 75643320, + "step": 3508, + "time_per_iteration": 2.7852745056152344 + }, + { + "auxiliary_loss_clip": 0.01111101, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.03044486, + "balance_loss_mlp": 1.00359213, + "epoch": 0.4219323032525702, + "flos": 19682818629120.0, + "grad_norm": 1.713169605788004, + "language_loss": 0.8198514, + "learning_rate": 2.593833237098701e-06, + "loss": 0.8418051, + "num_input_tokens_seen": 75660920, + "step": 3509, + "time_per_iteration": 2.7140660285949707 + }, + { + "auxiliary_loss_clip": 0.01135333, + "auxiliary_loss_mlp": 0.0108655, + "balance_loss_clip": 1.0301702, + "balance_loss_mlp": 1.00558352, + "epoch": 0.4220525461432093, + "flos": 30190747224960.0, + "grad_norm": 1.90470498071943, + "language_loss": 0.62284791, + "learning_rate": 2.593089349393512e-06, + "loss": 0.64506674, + "num_input_tokens_seen": 75681410, + "step": 3510, + "time_per_iteration": 2.8064045906066895 + }, + { + "auxiliary_loss_clip": 0.01131123, + "auxiliary_loss_mlp": 0.01086122, + "balance_loss_clip": 1.02891266, + "balance_loss_mlp": 1.00529838, + "epoch": 0.42217278903384836, + "flos": 24315941278080.0, + "grad_norm": 1.9018951990035178, + "language_loss": 0.83772826, + "learning_rate": 2.592345371717141e-06, + "loss": 0.85990071, + "num_input_tokens_seen": 75700940, + "step": 3511, + "time_per_iteration": 2.7107014656066895 + }, + { + "auxiliary_loss_clip": 0.01138015, + "auxiliary_loss_mlp": 0.01085212, + "balance_loss_clip": 1.0336622, + "balance_loss_mlp": 1.00462627, + "epoch": 0.42229303192448747, + "flos": 17092474352640.0, + "grad_norm": 15.217670360517557, + "language_loss": 0.71867895, + "learning_rate": 2.591601304182448e-06, + "loss": 0.74091125, + "num_input_tokens_seen": 75718910, + "step": 3512, + "time_per_iteration": 2.6978373527526855 + }, + { + "auxiliary_loss_clip": 0.01126665, + "auxiliary_loss_mlp": 0.01086401, + "balance_loss_clip": 1.03150403, + "balance_loss_mlp": 1.00581598, + "epoch": 0.4224132748151266, + "flos": 22784530878720.0, + "grad_norm": 1.715768272958443, + "language_loss": 0.79199529, + "learning_rate": 2.5908571469023067e-06, + "loss": 0.8141259, + "num_input_tokens_seen": 75738395, + "step": 3513, + "time_per_iteration": 2.778390407562256 + }, + { + "auxiliary_loss_clip": 0.01145488, + "auxiliary_loss_mlp": 0.01084815, + "balance_loss_clip": 1.03286362, + "balance_loss_mlp": 1.00403845, + "epoch": 0.42253351770576564, + "flos": 17819090576640.0, + "grad_norm": 2.417099061779706, + "language_loss": 0.75305629, + "learning_rate": 2.5901128999896067e-06, + "loss": 0.77535939, + "num_input_tokens_seen": 75753825, + "step": 3514, + "time_per_iteration": 2.6440720558166504 + }, + { + "auxiliary_loss_clip": 0.01135343, + "auxiliary_loss_mlp": 0.01084941, + "balance_loss_clip": 1.03205252, + "balance_loss_mlp": 1.00430822, + "epoch": 0.42265376059640475, + "flos": 28512390286080.0, + "grad_norm": 1.6568369184697544, + "language_loss": 0.68045163, + "learning_rate": 2.5893685635572487e-06, + "loss": 0.70265448, + "num_input_tokens_seen": 75774675, + "step": 3515, + "time_per_iteration": 2.712810516357422 + }, + { + "auxiliary_loss_clip": 0.01122431, + "auxiliary_loss_mlp": 0.01084982, + "balance_loss_clip": 1.02913618, + "balance_loss_mlp": 1.00396729, + "epoch": 0.4227740034870438, + "flos": 16253349753600.0, + "grad_norm": 2.4369243596364183, + "language_loss": 0.69276226, + "learning_rate": 2.5886241377181483e-06, + "loss": 0.71483636, + "num_input_tokens_seen": 75793545, + "step": 3516, + "time_per_iteration": 2.723500967025757 + }, + { + "auxiliary_loss_clip": 0.01135113, + "auxiliary_loss_mlp": 0.01085982, + "balance_loss_clip": 1.03152323, + "balance_loss_mlp": 1.00511014, + "epoch": 0.4228942463776829, + "flos": 25295691623040.0, + "grad_norm": 1.820720524335878, + "language_loss": 0.81457734, + "learning_rate": 2.587879622585234e-06, + "loss": 0.8367883, + "num_input_tokens_seen": 75812145, + "step": 3517, + "time_per_iteration": 2.7785115242004395 + }, + { + "auxiliary_loss_clip": 0.01135818, + "auxiliary_loss_mlp": 0.01086016, + "balance_loss_clip": 1.03287005, + "balance_loss_mlp": 1.0053829, + "epoch": 0.423014489268322, + "flos": 26395779507840.0, + "grad_norm": 2.1192546154371423, + "language_loss": 0.7576654, + "learning_rate": 2.5871350182714486e-06, + "loss": 0.77988368, + "num_input_tokens_seen": 75833025, + "step": 3518, + "time_per_iteration": 3.7394659519195557 + }, + { + "auxiliary_loss_clip": 0.01144742, + "auxiliary_loss_mlp": 0.01085269, + "balance_loss_clip": 1.03198624, + "balance_loss_mlp": 1.00463581, + "epoch": 0.4231347321589611, + "flos": 17274002711040.0, + "grad_norm": 1.9766187523597802, + "language_loss": 0.80288994, + "learning_rate": 2.586390324889748e-06, + "loss": 0.82519007, + "num_input_tokens_seen": 75848925, + "step": 3519, + "time_per_iteration": 2.665173053741455 + }, + { + "auxiliary_loss_clip": 0.0112793, + "auxiliary_loss_mlp": 0.01086719, + "balance_loss_clip": 1.02979302, + "balance_loss_mlp": 1.00599015, + "epoch": 0.4232549750496002, + "flos": 22999635475200.0, + "grad_norm": 1.8302385112578705, + "language_loss": 0.67333162, + "learning_rate": 2.5856455425531003e-06, + "loss": 0.69547808, + "num_input_tokens_seen": 75870400, + "step": 3520, + "time_per_iteration": 2.737337589263916 + }, + { + "auxiliary_loss_clip": 0.01136166, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.03304648, + "balance_loss_mlp": 1.0040226, + "epoch": 0.4233752179402393, + "flos": 21248343970560.0, + "grad_norm": 1.6698288418019291, + "language_loss": 0.80837035, + "learning_rate": 2.5849006713744902e-06, + "loss": 0.83057851, + "num_input_tokens_seen": 75889195, + "step": 3521, + "time_per_iteration": 2.7134954929351807 + }, + { + "auxiliary_loss_clip": 0.0112663, + "auxiliary_loss_mlp": 0.01085565, + "balance_loss_clip": 1.03186369, + "balance_loss_mlp": 1.00474131, + "epoch": 0.42349546083087836, + "flos": 20704297599360.0, + "grad_norm": 2.9115566617898327, + "language_loss": 0.72494692, + "learning_rate": 2.5841557114669135e-06, + "loss": 0.74706882, + "num_input_tokens_seen": 75906055, + "step": 3522, + "time_per_iteration": 2.7478997707366943 + }, + { + "auxiliary_loss_clip": 0.01146936, + "auxiliary_loss_mlp": 0.01086521, + "balance_loss_clip": 1.03336954, + "balance_loss_mlp": 1.00560224, + "epoch": 0.42361570372151747, + "flos": 18585065128320.0, + "grad_norm": 2.3604280532548874, + "language_loss": 0.6729908, + "learning_rate": 2.58341066294338e-06, + "loss": 0.69532537, + "num_input_tokens_seen": 75922720, + "step": 3523, + "time_per_iteration": 2.608060836791992 + }, + { + "auxiliary_loss_clip": 0.01108023, + "auxiliary_loss_mlp": 0.00873297, + "balance_loss_clip": 1.02981329, + "balance_loss_mlp": 1.00045907, + "epoch": 0.4237359466121566, + "flos": 20959478795520.0, + "grad_norm": 2.102458962429559, + "language_loss": 0.85103172, + "learning_rate": 2.5826655259169124e-06, + "loss": 0.8708449, + "num_input_tokens_seen": 75941375, + "step": 3524, + "time_per_iteration": 3.7767152786254883 + }, + { + "auxiliary_loss_clip": 0.01146989, + "auxiliary_loss_mlp": 0.0108668, + "balance_loss_clip": 1.03415298, + "balance_loss_mlp": 1.00595164, + "epoch": 0.42385618950279563, + "flos": 18038181582720.0, + "grad_norm": 1.6824290511135203, + "language_loss": 0.90431988, + "learning_rate": 2.5819203005005475e-06, + "loss": 0.92665654, + "num_input_tokens_seen": 75958710, + "step": 3525, + "time_per_iteration": 2.571030855178833 + }, + { + "auxiliary_loss_clip": 0.01115252, + "auxiliary_loss_mlp": 0.01085768, + "balance_loss_clip": 1.02676487, + "balance_loss_mlp": 1.00513446, + "epoch": 0.42397643239343474, + "flos": 23769129559680.0, + "grad_norm": 1.5398697110831288, + "language_loss": 0.78981006, + "learning_rate": 2.581174986807336e-06, + "loss": 0.81182027, + "num_input_tokens_seen": 75978945, + "step": 3526, + "time_per_iteration": 2.7406280040740967 + }, + { + "auxiliary_loss_clip": 0.01137086, + "auxiliary_loss_mlp": 0.00873327, + "balance_loss_clip": 1.03241539, + "balance_loss_mlp": 1.00040591, + "epoch": 0.42409667528407385, + "flos": 16545088016640.0, + "grad_norm": 2.185981622476728, + "language_loss": 0.91428661, + "learning_rate": 2.580429584950341e-06, + "loss": 0.93439078, + "num_input_tokens_seen": 75994695, + "step": 3527, + "time_per_iteration": 2.663384437561035 + }, + { + "auxiliary_loss_clip": 0.01098583, + "auxiliary_loss_mlp": 0.01085158, + "balance_loss_clip": 1.02827394, + "balance_loss_mlp": 1.00428665, + "epoch": 0.4242169181747129, + "flos": 16034186920320.0, + "grad_norm": 2.080147573505655, + "language_loss": 0.6656912, + "learning_rate": 2.5796840950426397e-06, + "loss": 0.68752861, + "num_input_tokens_seen": 76011780, + "step": 3528, + "time_per_iteration": 3.6208863258361816 + }, + { + "auxiliary_loss_clip": 0.01138351, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_clip": 1.03343916, + "balance_loss_mlp": 1.00427055, + "epoch": 0.424337161065352, + "flos": 20084012611200.0, + "grad_norm": 1.9151174244995919, + "language_loss": 0.65753919, + "learning_rate": 2.578938517197322e-06, + "loss": 0.67977262, + "num_input_tokens_seen": 76029875, + "step": 3529, + "time_per_iteration": 3.5450620651245117 + }, + { + "auxiliary_loss_clip": 0.01126837, + "auxiliary_loss_mlp": 0.01085859, + "balance_loss_clip": 1.03144252, + "balance_loss_mlp": 1.00513089, + "epoch": 0.4244574039559911, + "flos": 23878369797120.0, + "grad_norm": 2.2373273517430112, + "language_loss": 0.62896478, + "learning_rate": 2.5781928515274916e-06, + "loss": 0.65109175, + "num_input_tokens_seen": 76048595, + "step": 3530, + "time_per_iteration": 2.7678749561309814 + }, + { + "auxiliary_loss_clip": 0.01136636, + "auxiliary_loss_mlp": 0.0108643, + "balance_loss_clip": 1.03251171, + "balance_loss_mlp": 1.00574899, + "epoch": 0.4245776468466302, + "flos": 17565920542080.0, + "grad_norm": 1.9928756665749339, + "language_loss": 0.67834944, + "learning_rate": 2.577447098146265e-06, + "loss": 0.70058012, + "num_input_tokens_seen": 76065770, + "step": 3531, + "time_per_iteration": 2.7231156826019287 + }, + { + "auxiliary_loss_clip": 0.01116425, + "auxiliary_loss_mlp": 0.01086664, + "balance_loss_clip": 1.02968919, + "balance_loss_mlp": 1.00607896, + "epoch": 0.4246978897372693, + "flos": 27776256958080.0, + "grad_norm": 1.5341872486168127, + "language_loss": 0.78938681, + "learning_rate": 2.5767012571667724e-06, + "loss": 0.8114177, + "num_input_tokens_seen": 76085250, + "step": 3532, + "time_per_iteration": 2.9064781665802 + }, + { + "auxiliary_loss_clip": 0.01136055, + "auxiliary_loss_mlp": 0.01085421, + "balance_loss_clip": 1.03086114, + "balance_loss_mlp": 1.00464487, + "epoch": 0.42481813262790835, + "flos": 15596615439360.0, + "grad_norm": 1.9203174252506752, + "language_loss": 0.68525195, + "learning_rate": 2.5759553287021587e-06, + "loss": 0.70746672, + "num_input_tokens_seen": 76103580, + "step": 3533, + "time_per_iteration": 2.715196371078491 + }, + { + "auxiliary_loss_clip": 0.011223, + "auxiliary_loss_mlp": 0.01085973, + "balance_loss_clip": 1.02862239, + "balance_loss_mlp": 1.00519717, + "epoch": 0.42493837551854746, + "flos": 23951088881280.0, + "grad_norm": 1.8185552492529435, + "language_loss": 0.77082795, + "learning_rate": 2.5752093128655786e-06, + "loss": 0.79291064, + "num_input_tokens_seen": 76121825, + "step": 3534, + "time_per_iteration": 2.7895288467407227 + }, + { + "auxiliary_loss_clip": 0.01125516, + "auxiliary_loss_mlp": 0.0108575, + "balance_loss_clip": 1.03037167, + "balance_loss_mlp": 1.00502098, + "epoch": 0.4250586184091866, + "flos": 20813466009600.0, + "grad_norm": 1.7857646975322667, + "language_loss": 0.73856616, + "learning_rate": 2.574463209770204e-06, + "loss": 0.76067889, + "num_input_tokens_seen": 76141140, + "step": 3535, + "time_per_iteration": 2.727931022644043 + }, + { + "auxiliary_loss_clip": 0.01113341, + "auxiliary_loss_mlp": 0.01086427, + "balance_loss_clip": 1.02762604, + "balance_loss_mlp": 1.00569808, + "epoch": 0.42517886129982563, + "flos": 30371018607360.0, + "grad_norm": 1.5656143536339586, + "language_loss": 0.79261643, + "learning_rate": 2.5737170195292165e-06, + "loss": 0.81461418, + "num_input_tokens_seen": 76164475, + "step": 3536, + "time_per_iteration": 2.8944766521453857 + }, + { + "auxiliary_loss_clip": 0.01117308, + "auxiliary_loss_mlp": 0.01085651, + "balance_loss_clip": 1.02910829, + "balance_loss_mlp": 1.00477982, + "epoch": 0.42529910419046474, + "flos": 20080636732800.0, + "grad_norm": 2.894217800550263, + "language_loss": 0.77580345, + "learning_rate": 2.572970742255814e-06, + "loss": 0.79783309, + "num_input_tokens_seen": 76182965, + "step": 3537, + "time_per_iteration": 2.7673707008361816 + }, + { + "auxiliary_loss_clip": 0.01135054, + "auxiliary_loss_mlp": 0.01086589, + "balance_loss_clip": 1.03162766, + "balance_loss_mlp": 1.00590849, + "epoch": 0.42541934708110385, + "flos": 22632448694400.0, + "grad_norm": 1.6876963660349988, + "language_loss": 0.8137657, + "learning_rate": 2.5722243780632046e-06, + "loss": 0.83598208, + "num_input_tokens_seen": 76201230, + "step": 3538, + "time_per_iteration": 2.700496196746826 + }, + { + "auxiliary_loss_clip": 0.0110209, + "auxiliary_loss_mlp": 0.01079546, + "balance_loss_clip": 1.0370698, + "balance_loss_mlp": 1.00019991, + "epoch": 0.4255395899717429, + "flos": 66200676186240.0, + "grad_norm": 0.7522638656158874, + "language_loss": 0.60437125, + "learning_rate": 2.5714779270646125e-06, + "loss": 0.62618762, + "num_input_tokens_seen": 76262000, + "step": 3539, + "time_per_iteration": 3.403290033340454 + }, + { + "auxiliary_loss_clip": 0.01124074, + "auxiliary_loss_mlp": 0.00873342, + "balance_loss_clip": 1.02929747, + "balance_loss_mlp": 1.00032771, + "epoch": 0.425659832862382, + "flos": 17931814433280.0, + "grad_norm": 3.0630761436348553, + "language_loss": 0.77641833, + "learning_rate": 2.5707313893732735e-06, + "loss": 0.79639244, + "num_input_tokens_seen": 76280540, + "step": 3540, + "time_per_iteration": 2.7196319103240967 + }, + { + "auxiliary_loss_clip": 0.01088763, + "auxiliary_loss_mlp": 0.01084882, + "balance_loss_clip": 1.02714324, + "balance_loss_mlp": 1.00424862, + "epoch": 0.4257800757530211, + "flos": 24022550989440.0, + "grad_norm": 1.9024609735589229, + "language_loss": 0.77017784, + "learning_rate": 2.5699847651024364e-06, + "loss": 0.79191434, + "num_input_tokens_seen": 76301180, + "step": 3541, + "time_per_iteration": 3.1151647567749023 + }, + { + "auxiliary_loss_clip": 0.01132838, + "auxiliary_loss_mlp": 0.01084871, + "balance_loss_clip": 1.03057325, + "balance_loss_mlp": 1.00418997, + "epoch": 0.4259003186436602, + "flos": 23696015425920.0, + "grad_norm": 2.0202207235752905, + "language_loss": 0.76769096, + "learning_rate": 2.5692380543653627e-06, + "loss": 0.78986806, + "num_input_tokens_seen": 76319335, + "step": 3542, + "time_per_iteration": 2.837221622467041 + }, + { + "auxiliary_loss_clip": 0.01137914, + "auxiliary_loss_mlp": 0.00873274, + "balance_loss_clip": 1.03330541, + "balance_loss_mlp": 1.00032067, + "epoch": 0.4260205615342993, + "flos": 15259772672640.0, + "grad_norm": 1.9543800088328556, + "language_loss": 0.70007694, + "learning_rate": 2.5684912572753293e-06, + "loss": 0.72018886, + "num_input_tokens_seen": 76335010, + "step": 3543, + "time_per_iteration": 2.7244718074798584 + }, + { + "auxiliary_loss_clip": 0.01142887, + "auxiliary_loss_mlp": 0.01083731, + "balance_loss_clip": 1.03086376, + "balance_loss_mlp": 1.00319326, + "epoch": 0.4261408044249384, + "flos": 30665306736000.0, + "grad_norm": 1.629244861252447, + "language_loss": 0.8359288, + "learning_rate": 2.5677443739456245e-06, + "loss": 0.85819495, + "num_input_tokens_seen": 76356670, + "step": 3544, + "time_per_iteration": 3.482238531112671 + }, + { + "auxiliary_loss_clip": 0.0112011, + "auxiliary_loss_mlp": 0.01085562, + "balance_loss_clip": 1.02721775, + "balance_loss_mlp": 1.00497627, + "epoch": 0.42626104731557746, + "flos": 23257905240960.0, + "grad_norm": 2.4477167736970227, + "language_loss": 0.79523361, + "learning_rate": 2.5669974044895495e-06, + "loss": 0.81729031, + "num_input_tokens_seen": 76373065, + "step": 3545, + "time_per_iteration": 2.6520206928253174 + }, + { + "auxiliary_loss_clip": 0.01119158, + "auxiliary_loss_mlp": 0.0108731, + "balance_loss_clip": 1.03104591, + "balance_loss_mlp": 1.00662935, + "epoch": 0.42638129020621657, + "flos": 25884770670720.0, + "grad_norm": 1.6455429926308465, + "language_loss": 0.79192775, + "learning_rate": 2.5662503490204187e-06, + "loss": 0.8139925, + "num_input_tokens_seen": 76393230, + "step": 3546, + "time_per_iteration": 2.7643094062805176 + }, + { + "auxiliary_loss_clip": 0.01125039, + "auxiliary_loss_mlp": 0.01085829, + "balance_loss_clip": 1.0299834, + "balance_loss_mlp": 1.00524318, + "epoch": 0.4265015330968556, + "flos": 26502362138880.0, + "grad_norm": 2.0827733135516566, + "language_loss": 0.75766146, + "learning_rate": 2.5655032076515603e-06, + "loss": 0.77977014, + "num_input_tokens_seen": 76412555, + "step": 3547, + "time_per_iteration": 2.8047802448272705 + }, + { + "auxiliary_loss_clip": 0.01108506, + "auxiliary_loss_mlp": 0.01085951, + "balance_loss_clip": 1.03042412, + "balance_loss_mlp": 1.00531769, + "epoch": 0.42662177598749473, + "flos": 24389522288640.0, + "grad_norm": 3.5388308489167284, + "language_loss": 0.82043165, + "learning_rate": 2.5647559804963155e-06, + "loss": 0.84237623, + "num_input_tokens_seen": 76432485, + "step": 3548, + "time_per_iteration": 2.766749143600464 + }, + { + "auxiliary_loss_clip": 0.01107681, + "auxiliary_loss_mlp": 0.01086086, + "balance_loss_clip": 1.02867866, + "balance_loss_mlp": 1.00535738, + "epoch": 0.42674201887813384, + "flos": 23148629089920.0, + "grad_norm": 1.7392439189891413, + "language_loss": 0.78726888, + "learning_rate": 2.5640086676680364e-06, + "loss": 0.8092066, + "num_input_tokens_seen": 76453980, + "step": 3549, + "time_per_iteration": 3.987739324569702 + }, + { + "auxiliary_loss_clip": 0.01133563, + "auxiliary_loss_mlp": 0.01085744, + "balance_loss_clip": 1.0300988, + "balance_loss_mlp": 1.00492001, + "epoch": 0.4268622617687729, + "flos": 21689614552320.0, + "grad_norm": 2.269040252280733, + "language_loss": 0.80900824, + "learning_rate": 2.5632612692800923e-06, + "loss": 0.83120131, + "num_input_tokens_seen": 76473045, + "step": 3550, + "time_per_iteration": 2.6962454319000244 + }, + { + "auxiliary_loss_clip": 0.01114797, + "auxiliary_loss_mlp": 0.01086118, + "balance_loss_clip": 1.02926493, + "balance_loss_mlp": 1.00519919, + "epoch": 0.426982504659412, + "flos": 23440151871360.0, + "grad_norm": 2.0541001683028166, + "language_loss": 0.75132209, + "learning_rate": 2.5625137854458603e-06, + "loss": 0.77333128, + "num_input_tokens_seen": 76492060, + "step": 3551, + "time_per_iteration": 2.8168277740478516 + }, + { + "auxiliary_loss_clip": 0.01126476, + "auxiliary_loss_mlp": 0.01085526, + "balance_loss_clip": 1.03076136, + "balance_loss_mlp": 1.00494027, + "epoch": 0.4271027475500511, + "flos": 18916556768640.0, + "grad_norm": 1.8560481483062556, + "language_loss": 0.80213821, + "learning_rate": 2.561766216278735e-06, + "loss": 0.82425821, + "num_input_tokens_seen": 76509655, + "step": 3552, + "time_per_iteration": 2.8136467933654785 + }, + { + "auxiliary_loss_clip": 0.01101841, + "auxiliary_loss_mlp": 0.01084526, + "balance_loss_clip": 1.02617812, + "balance_loss_mlp": 1.00384474, + "epoch": 0.4272229904406902, + "flos": 26870554500480.0, + "grad_norm": 1.7717649814003462, + "language_loss": 0.81646895, + "learning_rate": 2.561018561892121e-06, + "loss": 0.83833259, + "num_input_tokens_seen": 76528795, + "step": 3553, + "time_per_iteration": 3.8327834606170654 + }, + { + "auxiliary_loss_clip": 0.01123556, + "auxiliary_loss_mlp": 0.01085355, + "balance_loss_clip": 1.02894998, + "balance_loss_mlp": 1.00476944, + "epoch": 0.4273432333313293, + "flos": 23951376190080.0, + "grad_norm": 1.5923441168892245, + "language_loss": 0.76831591, + "learning_rate": 2.5602708223994363e-06, + "loss": 0.79040504, + "num_input_tokens_seen": 76550660, + "step": 3554, + "time_per_iteration": 3.688385248184204 + }, + { + "auxiliary_loss_clip": 0.01118278, + "auxiliary_loss_mlp": 0.01084705, + "balance_loss_clip": 1.02985573, + "balance_loss_mlp": 1.00402391, + "epoch": 0.4274634762219684, + "flos": 29570354496000.0, + "grad_norm": 2.3445519910834265, + "language_loss": 0.67405558, + "learning_rate": 2.559522997914115e-06, + "loss": 0.69608539, + "num_input_tokens_seen": 76570240, + "step": 3555, + "time_per_iteration": 2.7959039211273193 + }, + { + "auxiliary_loss_clip": 0.01146543, + "auxiliary_loss_mlp": 0.01085164, + "balance_loss_clip": 1.03445089, + "balance_loss_mlp": 1.00457835, + "epoch": 0.42758371911260745, + "flos": 21434146047360.0, + "grad_norm": 2.2887988859042947, + "language_loss": 0.84604192, + "learning_rate": 2.558775088549599e-06, + "loss": 0.86835897, + "num_input_tokens_seen": 76589820, + "step": 3556, + "time_per_iteration": 2.704115867614746 + }, + { + "auxiliary_loss_clip": 0.01120184, + "auxiliary_loss_mlp": 0.01085188, + "balance_loss_clip": 1.03138924, + "balance_loss_mlp": 1.00441217, + "epoch": 0.42770396200324656, + "flos": 14752822072320.0, + "grad_norm": 2.2458910894208324, + "language_loss": 0.66434491, + "learning_rate": 2.5580270944193467e-06, + "loss": 0.68639857, + "num_input_tokens_seen": 76606640, + "step": 3557, + "time_per_iteration": 2.6365578174591064 + }, + { + "auxiliary_loss_clip": 0.01137319, + "auxiliary_loss_mlp": 0.01079516, + "balance_loss_clip": 1.04018557, + "balance_loss_mlp": 1.00016999, + "epoch": 0.4278242048938857, + "flos": 70654712601600.0, + "grad_norm": 0.7666979836497609, + "language_loss": 0.55521697, + "learning_rate": 2.557279015636827e-06, + "loss": 0.57738531, + "num_input_tokens_seen": 76667050, + "step": 3558, + "time_per_iteration": 3.211469888687134 + }, + { + "auxiliary_loss_clip": 0.01127643, + "auxiliary_loss_mlp": 0.01079611, + "balance_loss_clip": 1.03891802, + "balance_loss_mlp": 1.00026548, + "epoch": 0.42794444778452473, + "flos": 69366165033600.0, + "grad_norm": 0.7673392831006254, + "language_loss": 0.61231881, + "learning_rate": 2.5565308523155245e-06, + "loss": 0.63439137, + "num_input_tokens_seen": 76726650, + "step": 3559, + "time_per_iteration": 3.210296392440796 + }, + { + "auxiliary_loss_clip": 0.01104885, + "auxiliary_loss_mlp": 0.01085295, + "balance_loss_clip": 1.0290699, + "balance_loss_mlp": 1.00451887, + "epoch": 0.42806469067516384, + "flos": 18215328481920.0, + "grad_norm": 3.0093658940932584, + "language_loss": 0.82562357, + "learning_rate": 2.5557826045689336e-06, + "loss": 0.84752542, + "num_input_tokens_seen": 76742890, + "step": 3560, + "time_per_iteration": 2.794332504272461 + }, + { + "auxiliary_loss_clip": 0.01102872, + "auxiliary_loss_mlp": 0.01080339, + "balance_loss_clip": 1.03069687, + "balance_loss_mlp": 1.00099313, + "epoch": 0.4281849335658029, + "flos": 54535814432640.0, + "grad_norm": 0.8254497332346823, + "language_loss": 0.58903766, + "learning_rate": 2.5550342725105643e-06, + "loss": 0.61086977, + "num_input_tokens_seen": 76801055, + "step": 3561, + "time_per_iteration": 3.3071131706237793 + }, + { + "auxiliary_loss_clip": 0.01133981, + "auxiliary_loss_mlp": 0.01085702, + "balance_loss_clip": 1.03173709, + "balance_loss_mlp": 1.00506902, + "epoch": 0.428305176456442, + "flos": 17274828723840.0, + "grad_norm": 1.6074689507762507, + "language_loss": 0.81080091, + "learning_rate": 2.554285856253937e-06, + "loss": 0.83299774, + "num_input_tokens_seen": 76819890, + "step": 3562, + "time_per_iteration": 2.7318010330200195 + }, + { + "auxiliary_loss_clip": 0.01124028, + "auxiliary_loss_mlp": 0.01086129, + "balance_loss_clip": 1.03000689, + "balance_loss_mlp": 1.0054481, + "epoch": 0.4284254193470811, + "flos": 26359509749760.0, + "grad_norm": 1.914895570919295, + "language_loss": 0.77149063, + "learning_rate": 2.5535373559125855e-06, + "loss": 0.79359221, + "num_input_tokens_seen": 76840255, + "step": 3563, + "time_per_iteration": 2.802586317062378 + }, + { + "auxiliary_loss_clip": 0.01087777, + "auxiliary_loss_mlp": 0.01086037, + "balance_loss_clip": 1.02582908, + "balance_loss_mlp": 1.0052135, + "epoch": 0.42854566223772017, + "flos": 29714248379520.0, + "grad_norm": 1.5161080016071722, + "language_loss": 0.81941557, + "learning_rate": 2.552788771600057e-06, + "loss": 0.84115368, + "num_input_tokens_seen": 76860565, + "step": 3564, + "time_per_iteration": 2.9653677940368652 + }, + { + "auxiliary_loss_clip": 0.01117406, + "auxiliary_loss_mlp": 0.01086832, + "balance_loss_clip": 1.03127921, + "balance_loss_mlp": 1.00610316, + "epoch": 0.4286659051283593, + "flos": 22018161277440.0, + "grad_norm": 2.0451478136298293, + "language_loss": 0.81950307, + "learning_rate": 2.5520401034299118e-06, + "loss": 0.84154546, + "num_input_tokens_seen": 76878325, + "step": 3565, + "time_per_iteration": 2.824338912963867 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_clip": 1.03084159, + "balance_loss_mlp": 1.0040946, + "epoch": 0.4287861480189984, + "flos": 13334422838400.0, + "grad_norm": 2.3291557927760693, + "language_loss": 0.87778085, + "learning_rate": 2.551291351515722e-06, + "loss": 0.89996791, + "num_input_tokens_seen": 76895340, + "step": 3566, + "time_per_iteration": 2.695267915725708 + }, + { + "auxiliary_loss_clip": 0.01115831, + "auxiliary_loss_mlp": 0.00873257, + "balance_loss_clip": 1.0285064, + "balance_loss_mlp": 1.00033224, + "epoch": 0.42890639090963745, + "flos": 26651535321600.0, + "grad_norm": 1.5741939265833813, + "language_loss": 0.85839331, + "learning_rate": 2.5505425159710726e-06, + "loss": 0.87828422, + "num_input_tokens_seen": 76915150, + "step": 3567, + "time_per_iteration": 2.8348448276519775 + }, + { + "auxiliary_loss_clip": 0.0112771, + "auxiliary_loss_mlp": 0.00873178, + "balance_loss_clip": 1.03106558, + "balance_loss_mlp": 1.00026059, + "epoch": 0.42902663380027656, + "flos": 24055768091520.0, + "grad_norm": 1.8310411034519065, + "language_loss": 0.82949936, + "learning_rate": 2.549793596909561e-06, + "loss": 0.84950829, + "num_input_tokens_seen": 76933770, + "step": 3568, + "time_per_iteration": 2.7965316772460938 + }, + { + "auxiliary_loss_clip": 0.01118945, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_clip": 1.02664781, + "balance_loss_mlp": 1.00376046, + "epoch": 0.42914687669091567, + "flos": 15632561975040.0, + "grad_norm": 1.9188749581274733, + "language_loss": 0.6634028, + "learning_rate": 2.5490445944447976e-06, + "loss": 0.68543619, + "num_input_tokens_seen": 76952265, + "step": 3569, + "time_per_iteration": 3.5436768531799316 + }, + { + "auxiliary_loss_clip": 0.01133906, + "auxiliary_loss_mlp": 0.01085502, + "balance_loss_clip": 1.03121758, + "balance_loss_mlp": 1.0048691, + "epoch": 0.4292671195815547, + "flos": 31467802440960.0, + "grad_norm": 1.8150530116028973, + "language_loss": 0.65287834, + "learning_rate": 2.548295508690406e-06, + "loss": 0.67507243, + "num_input_tokens_seen": 76973560, + "step": 3570, + "time_per_iteration": 2.7902002334594727 + }, + { + "auxiliary_loss_clip": 0.01134684, + "auxiliary_loss_mlp": 0.01086423, + "balance_loss_clip": 1.03037572, + "balance_loss_mlp": 1.00569487, + "epoch": 0.42938736247219383, + "flos": 30257756046720.0, + "grad_norm": 3.13651184442932, + "language_loss": 0.76481724, + "learning_rate": 2.5475463397600217e-06, + "loss": 0.78702831, + "num_input_tokens_seen": 76993640, + "step": 3571, + "time_per_iteration": 2.808903217315674 + }, + { + "auxiliary_loss_clip": 0.01144302, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_clip": 1.03193486, + "balance_loss_mlp": 1.00571537, + "epoch": 0.42950760536283294, + "flos": 29349683291520.0, + "grad_norm": 1.8725468602693307, + "language_loss": 0.77350903, + "learning_rate": 2.546797087767293e-06, + "loss": 0.79581648, + "num_input_tokens_seen": 77013765, + "step": 3572, + "time_per_iteration": 2.68841290473938 + }, + { + "auxiliary_loss_clip": 0.01094077, + "auxiliary_loss_mlp": 0.0108626, + "balance_loss_clip": 1.02376032, + "balance_loss_mlp": 1.00557947, + "epoch": 0.429627848253472, + "flos": 26869943969280.0, + "grad_norm": 1.6240709880894422, + "language_loss": 0.87060046, + "learning_rate": 2.546047752825881e-06, + "loss": 0.89240384, + "num_input_tokens_seen": 77034370, + "step": 3573, + "time_per_iteration": 2.9000744819641113 + }, + { + "auxiliary_loss_clip": 0.01106608, + "auxiliary_loss_mlp": 0.0108409, + "balance_loss_clip": 1.02836168, + "balance_loss_mlp": 1.00345731, + "epoch": 0.4297480911441111, + "flos": 13881270470400.0, + "grad_norm": 1.9345512107661011, + "language_loss": 0.93168825, + "learning_rate": 2.5452983350494595e-06, + "loss": 0.95359522, + "num_input_tokens_seen": 77049925, + "step": 3574, + "time_per_iteration": 2.8127877712249756 + }, + { + "auxiliary_loss_clip": 0.01135118, + "auxiliary_loss_mlp": 0.0087305, + "balance_loss_clip": 1.03128946, + "balance_loss_mlp": 1.00031567, + "epoch": 0.4298683340347502, + "flos": 20741141975040.0, + "grad_norm": 2.0139274488491914, + "language_loss": 0.65276814, + "learning_rate": 2.544548834551713e-06, + "loss": 0.67284989, + "num_input_tokens_seen": 77068930, + "step": 3575, + "time_per_iteration": 3.6140482425689697 + }, + { + "auxiliary_loss_clip": 0.01106645, + "auxiliary_loss_mlp": 0.00873255, + "balance_loss_clip": 1.02720714, + "balance_loss_mlp": 1.00027108, + "epoch": 0.4299885769253893, + "flos": 20882126856960.0, + "grad_norm": 2.2999673298492835, + "language_loss": 0.94652343, + "learning_rate": 2.5437992514463424e-06, + "loss": 0.96632242, + "num_input_tokens_seen": 77082255, + "step": 3576, + "time_per_iteration": 2.770287036895752 + }, + { + "auxiliary_loss_clip": 0.01133561, + "auxiliary_loss_mlp": 0.01085707, + "balance_loss_clip": 1.02963233, + "balance_loss_mlp": 1.0049789, + "epoch": 0.4301088198160284, + "flos": 25484618183040.0, + "grad_norm": 1.91739382467768, + "language_loss": 0.87685728, + "learning_rate": 2.5430495858470565e-06, + "loss": 0.89904994, + "num_input_tokens_seen": 77101725, + "step": 3577, + "time_per_iteration": 2.7376763820648193 + }, + { + "auxiliary_loss_clip": 0.01125295, + "auxiliary_loss_mlp": 0.01084381, + "balance_loss_clip": 1.02808714, + "balance_loss_mlp": 1.00370038, + "epoch": 0.43022906270666744, + "flos": 18259427404800.0, + "grad_norm": 4.414113149200247, + "language_loss": 0.77690709, + "learning_rate": 2.54229983786758e-06, + "loss": 0.79900384, + "num_input_tokens_seen": 77119670, + "step": 3578, + "time_per_iteration": 2.829277992248535 + }, + { + "auxiliary_loss_clip": 0.01126941, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_clip": 1.03080642, + "balance_loss_mlp": 1.00411248, + "epoch": 0.43034930559730655, + "flos": 23399536567680.0, + "grad_norm": 1.779584554284974, + "language_loss": 0.85005897, + "learning_rate": 2.541550007621651e-06, + "loss": 0.87217629, + "num_input_tokens_seen": 77138160, + "step": 3579, + "time_per_iteration": 3.837709426879883 + }, + { + "auxiliary_loss_clip": 0.01134634, + "auxiliary_loss_mlp": 0.01085888, + "balance_loss_clip": 1.0320307, + "balance_loss_mlp": 1.00520682, + "epoch": 0.43046954848794566, + "flos": 28184382264960.0, + "grad_norm": 3.4708482882956697, + "language_loss": 0.80286348, + "learning_rate": 2.5408000952230156e-06, + "loss": 0.82506865, + "num_input_tokens_seen": 77156950, + "step": 3580, + "time_per_iteration": 2.7405049800872803 + }, + { + "auxiliary_loss_clip": 0.01102476, + "auxiliary_loss_mlp": 0.01086451, + "balance_loss_clip": 1.02965117, + "balance_loss_mlp": 1.00553179, + "epoch": 0.4305897913785847, + "flos": 28580476515840.0, + "grad_norm": 1.8103848745294389, + "language_loss": 0.90332574, + "learning_rate": 2.5400501007854357e-06, + "loss": 0.92521501, + "num_input_tokens_seen": 77176395, + "step": 3581, + "time_per_iteration": 2.879805564880371 + }, + { + "auxiliary_loss_clip": 0.01106938, + "auxiliary_loss_mlp": 0.01085573, + "balance_loss_clip": 1.02809668, + "balance_loss_mlp": 1.00489199, + "epoch": 0.43071003426922383, + "flos": 20448721353600.0, + "grad_norm": 1.6431861770071339, + "language_loss": 0.75605905, + "learning_rate": 2.539300024422685e-06, + "loss": 0.7779842, + "num_input_tokens_seen": 77194340, + "step": 3582, + "time_per_iteration": 2.789390802383423 + }, + { + "auxiliary_loss_clip": 0.01092802, + "auxiliary_loss_mlp": 0.01079505, + "balance_loss_clip": 1.03653407, + "balance_loss_mlp": 1.00015926, + "epoch": 0.43083027715986294, + "flos": 51997969883520.0, + "grad_norm": 0.8992181525815692, + "language_loss": 0.60926563, + "learning_rate": 2.538549866248549e-06, + "loss": 0.63098872, + "num_input_tokens_seen": 77249320, + "step": 3583, + "time_per_iteration": 3.1982080936431885 + }, + { + "auxiliary_loss_clip": 0.01133437, + "auxiliary_loss_mlp": 0.01085055, + "balance_loss_clip": 1.03054976, + "balance_loss_mlp": 1.00432694, + "epoch": 0.430950520050502, + "flos": 16690885320960.0, + "grad_norm": 1.7593095979616145, + "language_loss": 0.80979627, + "learning_rate": 2.5377996263768274e-06, + "loss": 0.83198124, + "num_input_tokens_seen": 77267400, + "step": 3584, + "time_per_iteration": 2.716506242752075 + }, + { + "auxiliary_loss_clip": 0.01135634, + "auxiliary_loss_mlp": 0.01087008, + "balance_loss_clip": 1.03152299, + "balance_loss_mlp": 1.00627923, + "epoch": 0.4310707629411411, + "flos": 24608433726720.0, + "grad_norm": 1.7424513211080654, + "language_loss": 0.68799353, + "learning_rate": 2.5370493049213293e-06, + "loss": 0.71021998, + "num_input_tokens_seen": 77287045, + "step": 3585, + "time_per_iteration": 2.725335121154785 + }, + { + "auxiliary_loss_clip": 0.01063964, + "auxiliary_loss_mlp": 0.01084972, + "balance_loss_clip": 1.02169347, + "balance_loss_mlp": 1.00414824, + "epoch": 0.4311910058317802, + "flos": 26432983019520.0, + "grad_norm": 1.8120455953051822, + "language_loss": 0.8054744, + "learning_rate": 2.536298901995878e-06, + "loss": 0.82696378, + "num_input_tokens_seen": 77306255, + "step": 3586, + "time_per_iteration": 3.13420033454895 + }, + { + "auxiliary_loss_clip": 0.01124435, + "auxiliary_loss_mlp": 0.01085514, + "balance_loss_clip": 1.02964783, + "balance_loss_mlp": 1.0047375, + "epoch": 0.43131124872241927, + "flos": 25155891889920.0, + "grad_norm": 1.7142701446775201, + "language_loss": 0.80045986, + "learning_rate": 2.535548417714311e-06, + "loss": 0.82255936, + "num_input_tokens_seen": 77325555, + "step": 3587, + "time_per_iteration": 2.9610724449157715 + }, + { + "auxiliary_loss_clip": 0.0112007, + "auxiliary_loss_mlp": 0.01084761, + "balance_loss_clip": 1.03091872, + "balance_loss_mlp": 1.00403214, + "epoch": 0.4314314916130584, + "flos": 21614812479360.0, + "grad_norm": 1.6195319294944919, + "language_loss": 0.87397206, + "learning_rate": 2.534797852190474e-06, + "loss": 0.89602041, + "num_input_tokens_seen": 77345735, + "step": 3588, + "time_per_iteration": 2.7733840942382812 + }, + { + "auxiliary_loss_clip": 0.01136104, + "auxiliary_loss_mlp": 0.01086367, + "balance_loss_clip": 1.03171301, + "balance_loss_mlp": 1.00559068, + "epoch": 0.4315517345036975, + "flos": 19275016544640.0, + "grad_norm": 1.7571590851855785, + "language_loss": 0.81346202, + "learning_rate": 2.5340472055382283e-06, + "loss": 0.83568668, + "num_input_tokens_seen": 77361765, + "step": 3589, + "time_per_iteration": 2.6914587020874023 + }, + { + "auxiliary_loss_clip": 0.01117737, + "auxiliary_loss_mlp": 0.01085285, + "balance_loss_clip": 1.02979434, + "balance_loss_mlp": 1.00465155, + "epoch": 0.43167197739433655, + "flos": 24273853516800.0, + "grad_norm": 1.8889970414842234, + "language_loss": 0.81059819, + "learning_rate": 2.5332964778714468e-06, + "loss": 0.83262837, + "num_input_tokens_seen": 77378950, + "step": 3590, + "time_per_iteration": 2.8019425868988037 + }, + { + "auxiliary_loss_clip": 0.01113649, + "auxiliary_loss_mlp": 0.01086604, + "balance_loss_clip": 1.02927887, + "balance_loss_mlp": 1.00592303, + "epoch": 0.43179222028497566, + "flos": 16867816738560.0, + "grad_norm": 1.6684418388607294, + "language_loss": 0.6601311, + "learning_rate": 2.5325456693040123e-06, + "loss": 0.68213362, + "num_input_tokens_seen": 77396145, + "step": 3591, + "time_per_iteration": 2.7871575355529785 + }, + { + "auxiliary_loss_clip": 0.01121552, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_clip": 1.03216362, + "balance_loss_mlp": 1.0041275, + "epoch": 0.43191246317561477, + "flos": 17639214243840.0, + "grad_norm": 5.104808115767103, + "language_loss": 0.75015879, + "learning_rate": 2.531794779949824e-06, + "loss": 0.77222437, + "num_input_tokens_seen": 77414045, + "step": 3592, + "time_per_iteration": 2.86327862739563 + }, + { + "auxiliary_loss_clip": 0.01114873, + "auxiliary_loss_mlp": 0.01085187, + "balance_loss_clip": 1.02939653, + "balance_loss_mlp": 1.00460196, + "epoch": 0.4320327060662538, + "flos": 23878800760320.0, + "grad_norm": 1.7282285608119496, + "language_loss": 0.88204741, + "learning_rate": 2.5310438099227903e-06, + "loss": 0.90404809, + "num_input_tokens_seen": 77431310, + "step": 3593, + "time_per_iteration": 2.8847882747650146 + }, + { + "auxiliary_loss_clip": 0.01125708, + "auxiliary_loss_mlp": 0.01079436, + "balance_loss_clip": 1.03686488, + "balance_loss_mlp": 1.00009048, + "epoch": 0.43215294895689293, + "flos": 66394917959040.0, + "grad_norm": 0.8106938641713733, + "language_loss": 0.53440797, + "learning_rate": 2.530292759336833e-06, + "loss": 0.55645943, + "num_input_tokens_seen": 77492045, + "step": 3594, + "time_per_iteration": 3.336003065109253 + }, + { + "auxiliary_loss_clip": 0.0111868, + "auxiliary_loss_mlp": 0.01084306, + "balance_loss_clip": 1.02634144, + "balance_loss_mlp": 1.00352979, + "epoch": 0.432273191847532, + "flos": 20594267262720.0, + "grad_norm": 3.964143539252478, + "language_loss": 0.69544101, + "learning_rate": 2.5295416283058855e-06, + "loss": 0.71747088, + "num_input_tokens_seen": 77510910, + "step": 3595, + "time_per_iteration": 3.715693473815918 + }, + { + "auxiliary_loss_clip": 0.01128238, + "auxiliary_loss_mlp": 0.00873251, + "balance_loss_clip": 1.03226614, + "balance_loss_mlp": 1.00033927, + "epoch": 0.4323934347381711, + "flos": 19282127437440.0, + "grad_norm": 1.7848667716086692, + "language_loss": 0.66238576, + "learning_rate": 2.5287904169438943e-06, + "loss": 0.6824007, + "num_input_tokens_seen": 77530115, + "step": 3596, + "time_per_iteration": 2.6927740573883057 + }, + { + "auxiliary_loss_clip": 0.01087848, + "auxiliary_loss_mlp": 0.01086591, + "balance_loss_clip": 1.02718329, + "balance_loss_mlp": 1.0057199, + "epoch": 0.4325136776288102, + "flos": 21726315273600.0, + "grad_norm": 2.60873532646058, + "language_loss": 0.63964605, + "learning_rate": 2.528039125364817e-06, + "loss": 0.66139054, + "num_input_tokens_seen": 77548920, + "step": 3597, + "time_per_iteration": 2.9214963912963867 + }, + { + "auxiliary_loss_clip": 0.01112947, + "auxiliary_loss_mlp": 0.01085664, + "balance_loss_clip": 1.02755547, + "balance_loss_mlp": 1.00483978, + "epoch": 0.43263392051944927, + "flos": 22340746344960.0, + "grad_norm": 2.091118947039233, + "language_loss": 0.75503314, + "learning_rate": 2.5272877536826246e-06, + "loss": 0.7770192, + "num_input_tokens_seen": 77567715, + "step": 3598, + "time_per_iteration": 2.7590036392211914 + }, + { + "auxiliary_loss_clip": 0.01104223, + "auxiliary_loss_mlp": 0.01085122, + "balance_loss_clip": 1.02660465, + "balance_loss_mlp": 1.00439358, + "epoch": 0.4327541634100884, + "flos": 29168406328320.0, + "grad_norm": 2.468148210479386, + "language_loss": 0.70316488, + "learning_rate": 2.5265363020112986e-06, + "loss": 0.72505832, + "num_input_tokens_seen": 77588035, + "step": 3599, + "time_per_iteration": 2.896491050720215 + }, + { + "auxiliary_loss_clip": 0.01134161, + "auxiliary_loss_mlp": 0.01085704, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.0049752, + "epoch": 0.4328744063007275, + "flos": 26067448264320.0, + "grad_norm": 2.6884218113772196, + "language_loss": 0.83936697, + "learning_rate": 2.5257847704648344e-06, + "loss": 0.86156559, + "num_input_tokens_seen": 77609265, + "step": 3600, + "time_per_iteration": 2.7516801357269287 + }, + { + "auxiliary_loss_clip": 0.01144383, + "auxiliary_loss_mlp": 0.010854, + "balance_loss_clip": 1.03202438, + "balance_loss_mlp": 1.00471902, + "epoch": 0.43299464919136654, + "flos": 16581357774720.0, + "grad_norm": 1.8313486741369631, + "language_loss": 0.75410897, + "learning_rate": 2.525033159157239e-06, + "loss": 0.77640676, + "num_input_tokens_seen": 77625580, + "step": 3601, + "time_per_iteration": 3.5174167156219482 + }, + { + "auxiliary_loss_clip": 0.01134712, + "auxiliary_loss_mlp": 0.01086582, + "balance_loss_clip": 1.03030574, + "balance_loss_mlp": 1.00566316, + "epoch": 0.43311489208200565, + "flos": 16107265140480.0, + "grad_norm": 1.6612333649137356, + "language_loss": 0.77252781, + "learning_rate": 2.52428146820253e-06, + "loss": 0.79474074, + "num_input_tokens_seen": 77643835, + "step": 3602, + "time_per_iteration": 2.6859307289123535 + }, + { + "auxiliary_loss_clip": 0.01113748, + "auxiliary_loss_mlp": 0.01084547, + "balance_loss_clip": 1.02885723, + "balance_loss_mlp": 1.00386643, + "epoch": 0.43323513497264476, + "flos": 22930220442240.0, + "grad_norm": 1.8969629573807432, + "language_loss": 0.81812435, + "learning_rate": 2.52352969771474e-06, + "loss": 0.84010732, + "num_input_tokens_seen": 77663060, + "step": 3603, + "time_per_iteration": 2.823154926300049 + }, + { + "auxiliary_loss_clip": 0.01125648, + "auxiliary_loss_mlp": 0.0108572, + "balance_loss_clip": 1.03029716, + "balance_loss_mlp": 1.00503945, + "epoch": 0.4333553778632838, + "flos": 25299031587840.0, + "grad_norm": 1.7746701620594991, + "language_loss": 0.88472855, + "learning_rate": 2.5227778478079106e-06, + "loss": 0.90684223, + "num_input_tokens_seen": 77682470, + "step": 3604, + "time_per_iteration": 3.6972768306732178 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01085803, + "balance_loss_clip": 1.03256071, + "balance_loss_mlp": 1.00507426, + "epoch": 0.43347562075392293, + "flos": 19387165783680.0, + "grad_norm": 1.4890081128593793, + "language_loss": 0.7684375, + "learning_rate": 2.522025918596098e-06, + "loss": 0.79066044, + "num_input_tokens_seen": 77700770, + "step": 3605, + "time_per_iteration": 3.672499418258667 + }, + { + "auxiliary_loss_clip": 0.01120325, + "auxiliary_loss_mlp": 0.0108561, + "balance_loss_clip": 1.03239477, + "balance_loss_mlp": 1.00502419, + "epoch": 0.43359586364456204, + "flos": 26325969425280.0, + "grad_norm": 1.764352330626622, + "language_loss": 0.65880775, + "learning_rate": 2.521273910193368e-06, + "loss": 0.68086708, + "num_input_tokens_seen": 77723950, + "step": 3606, + "time_per_iteration": 2.7848260402679443 + }, + { + "auxiliary_loss_clip": 0.01136658, + "auxiliary_loss_mlp": 0.01085255, + "balance_loss_clip": 1.03203297, + "balance_loss_mlp": 1.00452662, + "epoch": 0.4337161065352011, + "flos": 15989261984640.0, + "grad_norm": 2.3821144511386816, + "language_loss": 0.86892784, + "learning_rate": 2.5205218227138006e-06, + "loss": 0.8911469, + "num_input_tokens_seen": 77736905, + "step": 3607, + "time_per_iteration": 2.705717086791992 + }, + { + "auxiliary_loss_clip": 0.01144824, + "auxiliary_loss_mlp": 0.01084816, + "balance_loss_clip": 1.03210616, + "balance_loss_mlp": 1.00418329, + "epoch": 0.4338363494258402, + "flos": 20224710184320.0, + "grad_norm": 1.7880158178853502, + "language_loss": 0.7907151, + "learning_rate": 2.519769656271486e-06, + "loss": 0.81301153, + "num_input_tokens_seen": 77754325, + "step": 3608, + "time_per_iteration": 2.686762809753418 + }, + { + "auxiliary_loss_clip": 0.01095097, + "auxiliary_loss_mlp": 0.01085297, + "balance_loss_clip": 1.0253315, + "balance_loss_mlp": 1.0045681, + "epoch": 0.43395659231647926, + "flos": 20083904870400.0, + "grad_norm": 1.8714503493315204, + "language_loss": 0.67242885, + "learning_rate": 2.5190174109805285e-06, + "loss": 0.69423276, + "num_input_tokens_seen": 77774150, + "step": 3609, + "time_per_iteration": 2.8285610675811768 + }, + { + "auxiliary_loss_clip": 0.01119885, + "auxiliary_loss_mlp": 0.0108534, + "balance_loss_clip": 1.02672052, + "balance_loss_mlp": 1.0046593, + "epoch": 0.43407683520711837, + "flos": 19901801894400.0, + "grad_norm": 1.733309091935139, + "language_loss": 0.63828647, + "learning_rate": 2.518265086955042e-06, + "loss": 0.66033876, + "num_input_tokens_seen": 77791870, + "step": 3610, + "time_per_iteration": 2.7169742584228516 + }, + { + "auxiliary_loss_clip": 0.01143257, + "auxiliary_loss_mlp": 0.01085074, + "balance_loss_clip": 1.03094757, + "balance_loss_mlp": 1.00439334, + "epoch": 0.4341970780977575, + "flos": 23108732058240.0, + "grad_norm": 1.7655725248761425, + "language_loss": 0.83726895, + "learning_rate": 2.5175126843091534e-06, + "loss": 0.85955226, + "num_input_tokens_seen": 77811240, + "step": 3611, + "time_per_iteration": 2.6267359256744385 + }, + { + "auxiliary_loss_clip": 0.01110516, + "auxiliary_loss_mlp": 0.01084671, + "balance_loss_clip": 1.03102994, + "balance_loss_mlp": 1.00403774, + "epoch": 0.43431732098839654, + "flos": 37408288406400.0, + "grad_norm": 3.006225251643094, + "language_loss": 0.75391912, + "learning_rate": 2.5167602031570034e-06, + "loss": 0.77587098, + "num_input_tokens_seen": 77831425, + "step": 3612, + "time_per_iteration": 2.82682728767395 + }, + { + "auxiliary_loss_clip": 0.01145305, + "auxiliary_loss_mlp": 0.01085544, + "balance_loss_clip": 1.03258836, + "balance_loss_mlp": 1.00495875, + "epoch": 0.43443756387903565, + "flos": 31868206323840.0, + "grad_norm": 1.5548249176227196, + "language_loss": 0.73424834, + "learning_rate": 2.51600764361274e-06, + "loss": 0.75655687, + "num_input_tokens_seen": 77852950, + "step": 3613, + "time_per_iteration": 2.7155447006225586 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_clip": 1.03271925, + "balance_loss_mlp": 1.00432932, + "epoch": 0.43455780676967476, + "flos": 23477139901440.0, + "grad_norm": 3.391674847120966, + "language_loss": 0.78822845, + "learning_rate": 2.5152550057905283e-06, + "loss": 0.81052595, + "num_input_tokens_seen": 77872840, + "step": 3614, + "time_per_iteration": 2.6985855102539062 + }, + { + "auxiliary_loss_clip": 0.01132232, + "auxiliary_loss_mlp": 0.00873266, + "balance_loss_clip": 1.02939224, + "balance_loss_mlp": 1.00024939, + "epoch": 0.4346780496603138, + "flos": 24207060176640.0, + "grad_norm": 2.4028025005703753, + "language_loss": 0.77428705, + "learning_rate": 2.5145022898045415e-06, + "loss": 0.79434204, + "num_input_tokens_seen": 77892025, + "step": 3615, + "time_per_iteration": 2.7344512939453125 + }, + { + "auxiliary_loss_clip": 0.01111266, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_clip": 1.03030717, + "balance_loss_mlp": 1.00515163, + "epoch": 0.4347982925509529, + "flos": 17092366611840.0, + "grad_norm": 2.07816860566105, + "language_loss": 0.89655089, + "learning_rate": 2.5137494957689664e-06, + "loss": 0.91852331, + "num_input_tokens_seen": 77907635, + "step": 3616, + "time_per_iteration": 2.681497573852539 + }, + { + "auxiliary_loss_clip": 0.01117903, + "auxiliary_loss_mlp": 0.01079964, + "balance_loss_clip": 1.03781033, + "balance_loss_mlp": 1.00061786, + "epoch": 0.43491853544159204, + "flos": 60945544696320.0, + "grad_norm": 0.7667235321654294, + "language_loss": 0.57369673, + "learning_rate": 2.5129966237980016e-06, + "loss": 0.59567547, + "num_input_tokens_seen": 77970630, + "step": 3617, + "time_per_iteration": 3.3159656524658203 + }, + { + "auxiliary_loss_clip": 0.01116678, + "auxiliary_loss_mlp": 0.01085869, + "balance_loss_clip": 1.02990472, + "balance_loss_mlp": 1.00509298, + "epoch": 0.4350387783322311, + "flos": 21944652094080.0, + "grad_norm": 1.7354971897950284, + "language_loss": 0.78209502, + "learning_rate": 2.512243674005857e-06, + "loss": 0.80412048, + "num_input_tokens_seen": 77989995, + "step": 3618, + "time_per_iteration": 2.7679765224456787 + }, + { + "auxiliary_loss_clip": 0.01101284, + "auxiliary_loss_mlp": 0.01085589, + "balance_loss_clip": 1.02998435, + "balance_loss_mlp": 1.00486076, + "epoch": 0.4351590212228702, + "flos": 25082705928960.0, + "grad_norm": 1.8057964385741387, + "language_loss": 0.86389434, + "learning_rate": 2.5114906465067537e-06, + "loss": 0.88576311, + "num_input_tokens_seen": 78010980, + "step": 3619, + "time_per_iteration": 2.896014928817749 + }, + { + "auxiliary_loss_clip": 0.01136484, + "auxiliary_loss_mlp": 0.01084737, + "balance_loss_clip": 1.03152478, + "balance_loss_mlp": 1.00405645, + "epoch": 0.4352792641135093, + "flos": 21506541909120.0, + "grad_norm": 2.1826001352895146, + "language_loss": 0.74959373, + "learning_rate": 2.5107375414149264e-06, + "loss": 0.771806, + "num_input_tokens_seen": 78030225, + "step": 3620, + "time_per_iteration": 2.7451796531677246 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01085179, + "balance_loss_clip": 1.02759743, + "balance_loss_mlp": 1.00445104, + "epoch": 0.43539950700414837, + "flos": 16253457494400.0, + "grad_norm": 2.0696475013680864, + "language_loss": 0.7147938, + "learning_rate": 2.5099843588446197e-06, + "loss": 0.73670208, + "num_input_tokens_seen": 78048545, + "step": 3621, + "time_per_iteration": 3.761251926422119 + }, + { + "auxiliary_loss_clip": 0.01089003, + "auxiliary_loss_mlp": 0.0108594, + "balance_loss_clip": 1.02775645, + "balance_loss_mlp": 1.00525928, + "epoch": 0.4355197498947875, + "flos": 16691819074560.0, + "grad_norm": 1.5655437405717751, + "language_loss": 0.61425859, + "learning_rate": 2.509231098910091e-06, + "loss": 0.63600802, + "num_input_tokens_seen": 78068415, + "step": 3622, + "time_per_iteration": 2.904452085494995 + }, + { + "auxiliary_loss_clip": 0.01119348, + "auxiliary_loss_mlp": 0.01085095, + "balance_loss_clip": 1.02760708, + "balance_loss_mlp": 1.00441372, + "epoch": 0.4356399927854266, + "flos": 16362733645440.0, + "grad_norm": 2.092156566451648, + "language_loss": 0.74912822, + "learning_rate": 2.508477761725611e-06, + "loss": 0.7711727, + "num_input_tokens_seen": 78086690, + "step": 3623, + "time_per_iteration": 2.719359874725342 + }, + { + "auxiliary_loss_clip": 0.01133778, + "auxiliary_loss_mlp": 0.01086041, + "balance_loss_clip": 1.02942991, + "balance_loss_mlp": 1.00526512, + "epoch": 0.43576023567606564, + "flos": 17202037812480.0, + "grad_norm": 1.8813203827458698, + "language_loss": 0.80596936, + "learning_rate": 2.507724347405458e-06, + "loss": 0.8281675, + "num_input_tokens_seen": 78104640, + "step": 3624, + "time_per_iteration": 2.688469409942627 + }, + { + "auxiliary_loss_clip": 0.01108573, + "auxiliary_loss_mlp": 0.01084763, + "balance_loss_clip": 1.02978206, + "balance_loss_mlp": 1.00408196, + "epoch": 0.43588047856670475, + "flos": 15917656222080.0, + "grad_norm": 4.050502449919172, + "language_loss": 0.82496917, + "learning_rate": 2.5069708560639243e-06, + "loss": 0.84690261, + "num_input_tokens_seen": 78122550, + "step": 3625, + "time_per_iteration": 2.8482935428619385 + }, + { + "auxiliary_loss_clip": 0.01117253, + "auxiliary_loss_mlp": 0.01086603, + "balance_loss_clip": 1.02980256, + "balance_loss_mlp": 1.00582671, + "epoch": 0.4360007214573438, + "flos": 23659566099840.0, + "grad_norm": 2.400342320199254, + "language_loss": 0.61572039, + "learning_rate": 2.5062172878153158e-06, + "loss": 0.63775897, + "num_input_tokens_seen": 78141825, + "step": 3626, + "time_per_iteration": 3.7062928676605225 + }, + { + "auxiliary_loss_clip": 0.01080471, + "auxiliary_loss_mlp": 0.01086684, + "balance_loss_clip": 1.02751279, + "balance_loss_mlp": 1.00595582, + "epoch": 0.4361209643479829, + "flos": 21978767036160.0, + "grad_norm": 1.7672163869967215, + "language_loss": 0.87161946, + "learning_rate": 2.505463642773947e-06, + "loss": 0.89329094, + "num_input_tokens_seen": 78161790, + "step": 3627, + "time_per_iteration": 2.898059368133545 + }, + { + "auxiliary_loss_clip": 0.01114908, + "auxiliary_loss_mlp": 0.00873253, + "balance_loss_clip": 1.02889037, + "balance_loss_mlp": 1.00024509, + "epoch": 0.43624120723862203, + "flos": 17420159151360.0, + "grad_norm": 2.189678921775136, + "language_loss": 0.75237775, + "learning_rate": 2.504709921054146e-06, + "loss": 0.77225947, + "num_input_tokens_seen": 78178605, + "step": 3628, + "time_per_iteration": 2.7335433959960938 + }, + { + "auxiliary_loss_clip": 0.01119149, + "auxiliary_loss_mlp": 0.0108579, + "balance_loss_clip": 1.03003991, + "balance_loss_mlp": 1.00506139, + "epoch": 0.4363614501292611, + "flos": 17895293280000.0, + "grad_norm": 2.0357970914995436, + "language_loss": 0.83955753, + "learning_rate": 2.50395612277025e-06, + "loss": 0.86160696, + "num_input_tokens_seen": 78194460, + "step": 3629, + "time_per_iteration": 3.7517058849334717 + }, + { + "auxiliary_loss_clip": 0.01126097, + "auxiliary_loss_mlp": 0.01085409, + "balance_loss_clip": 1.02979279, + "balance_loss_mlp": 1.00487149, + "epoch": 0.4364816930199002, + "flos": 20302888135680.0, + "grad_norm": 2.282172967379626, + "language_loss": 0.72771609, + "learning_rate": 2.503202248036612e-06, + "loss": 0.74983114, + "num_input_tokens_seen": 78213315, + "step": 3630, + "time_per_iteration": 3.730833053588867 + }, + { + "auxiliary_loss_clip": 0.01143645, + "auxiliary_loss_mlp": 0.01084625, + "balance_loss_clip": 1.03132653, + "balance_loss_mlp": 1.00403953, + "epoch": 0.4366019359105393, + "flos": 24061334699520.0, + "grad_norm": 1.6649847773974427, + "language_loss": 0.73247671, + "learning_rate": 2.5024482969675927e-06, + "loss": 0.75475943, + "num_input_tokens_seen": 78233270, + "step": 3631, + "time_per_iteration": 2.7070059776306152 + }, + { + "auxiliary_loss_clip": 0.01093, + "auxiliary_loss_mlp": 0.01085446, + "balance_loss_clip": 1.03047824, + "balance_loss_mlp": 1.00481319, + "epoch": 0.43672217880117836, + "flos": 21754109422080.0, + "grad_norm": 2.0178275838372652, + "language_loss": 0.84180856, + "learning_rate": 2.501694269677566e-06, + "loss": 0.86359304, + "num_input_tokens_seen": 78251040, + "step": 3632, + "time_per_iteration": 2.8515498638153076 + }, + { + "auxiliary_loss_clip": 0.01133666, + "auxiliary_loss_mlp": 0.01084025, + "balance_loss_clip": 1.02932763, + "balance_loss_mlp": 1.00334394, + "epoch": 0.4368424216918175, + "flos": 18035200753920.0, + "grad_norm": 2.131641479691791, + "language_loss": 0.80851781, + "learning_rate": 2.500940166280918e-06, + "loss": 0.83069468, + "num_input_tokens_seen": 78269470, + "step": 3633, + "time_per_iteration": 2.6392273902893066 + }, + { + "auxiliary_loss_clip": 0.01136041, + "auxiliary_loss_mlp": 0.0108603, + "balance_loss_clip": 1.03159583, + "balance_loss_mlp": 1.0053966, + "epoch": 0.4369626645824566, + "flos": 25447127362560.0, + "grad_norm": 3.1903978998723543, + "language_loss": 0.79280376, + "learning_rate": 2.500185986892045e-06, + "loss": 0.8150245, + "num_input_tokens_seen": 78288955, + "step": 3634, + "time_per_iteration": 2.801906108856201 + }, + { + "auxiliary_loss_clip": 0.01136427, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_clip": 1.03158212, + "balance_loss_mlp": 1.00445569, + "epoch": 0.43708290747309564, + "flos": 25302694775040.0, + "grad_norm": 2.3084499650187347, + "language_loss": 0.77369082, + "learning_rate": 2.499431731625355e-06, + "loss": 0.79590738, + "num_input_tokens_seen": 78307980, + "step": 3635, + "time_per_iteration": 2.712689161300659 + }, + { + "auxiliary_loss_clip": 0.01144539, + "auxiliary_loss_mlp": 0.01086087, + "balance_loss_clip": 1.03156376, + "balance_loss_mlp": 1.00516725, + "epoch": 0.43720315036373475, + "flos": 31575103344000.0, + "grad_norm": 1.9151760713567174, + "language_loss": 0.79398584, + "learning_rate": 2.4986774005952686e-06, + "loss": 0.81629217, + "num_input_tokens_seen": 78330355, + "step": 3636, + "time_per_iteration": 2.8640246391296387 + }, + { + "auxiliary_loss_clip": 0.01129341, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_clip": 1.02783775, + "balance_loss_mlp": 1.00394762, + "epoch": 0.43732339325437386, + "flos": 23112000195840.0, + "grad_norm": 1.8933135245770967, + "language_loss": 0.84573585, + "learning_rate": 2.4979229939162166e-06, + "loss": 0.86787462, + "num_input_tokens_seen": 78349135, + "step": 3637, + "time_per_iteration": 2.7410547733306885 + }, + { + "auxiliary_loss_clip": 0.01133939, + "auxiliary_loss_mlp": 0.01086182, + "balance_loss_clip": 1.03130126, + "balance_loss_mlp": 1.00559628, + "epoch": 0.4374436361450129, + "flos": 27746272080000.0, + "grad_norm": 1.9440106015699983, + "language_loss": 0.80779392, + "learning_rate": 2.4971685117026433e-06, + "loss": 0.82999516, + "num_input_tokens_seen": 78368900, + "step": 3638, + "time_per_iteration": 2.7746050357818604 + }, + { + "auxiliary_loss_clip": 0.01137139, + "auxiliary_loss_mlp": 0.01085037, + "balance_loss_clip": 1.03264606, + "balance_loss_mlp": 1.00449955, + "epoch": 0.437563879035652, + "flos": 24172370616960.0, + "grad_norm": 1.487900555614564, + "language_loss": 0.76777709, + "learning_rate": 2.4964139540690018e-06, + "loss": 0.78999889, + "num_input_tokens_seen": 78392235, + "step": 3639, + "time_per_iteration": 2.8139162063598633 + }, + { + "auxiliary_loss_clip": 0.01107449, + "auxiliary_loss_mlp": 0.01085563, + "balance_loss_clip": 1.02697062, + "balance_loss_mlp": 1.00473893, + "epoch": 0.4376841219262911, + "flos": 23477211728640.0, + "grad_norm": 1.8297690305911416, + "language_loss": 0.73074543, + "learning_rate": 2.495659321129758e-06, + "loss": 0.75267559, + "num_input_tokens_seen": 78409980, + "step": 3640, + "time_per_iteration": 2.766374349594116 + }, + { + "auxiliary_loss_clip": 0.01137376, + "auxiliary_loss_mlp": 0.01084474, + "balance_loss_clip": 1.03220916, + "balance_loss_mlp": 1.00384128, + "epoch": 0.4378043648169302, + "flos": 25447809720960.0, + "grad_norm": 2.5268732455104197, + "language_loss": 0.75352788, + "learning_rate": 2.494904612999389e-06, + "loss": 0.77574646, + "num_input_tokens_seen": 78428690, + "step": 3641, + "time_per_iteration": 2.7494351863861084 + }, + { + "auxiliary_loss_clip": 0.01126215, + "auxiliary_loss_mlp": 0.01079863, + "balance_loss_clip": 1.03778338, + "balance_loss_mlp": 1.00051785, + "epoch": 0.4379246077075693, + "flos": 53914056986880.0, + "grad_norm": 0.801758598045067, + "language_loss": 0.56564415, + "learning_rate": 2.4941498297923843e-06, + "loss": 0.58770496, + "num_input_tokens_seen": 78489260, + "step": 3642, + "time_per_iteration": 3.231567859649658 + }, + { + "auxiliary_loss_clip": 0.01133064, + "auxiliary_loss_mlp": 0.01086341, + "balance_loss_clip": 1.03027868, + "balance_loss_mlp": 1.00566006, + "epoch": 0.43804485059820836, + "flos": 20588305605120.0, + "grad_norm": 1.737332167066802, + "language_loss": 0.69659317, + "learning_rate": 2.4933949716232424e-06, + "loss": 0.71878719, + "num_input_tokens_seen": 78506785, + "step": 3643, + "time_per_iteration": 2.6825876235961914 + }, + { + "auxiliary_loss_clip": 0.01095476, + "auxiliary_loss_mlp": 0.01085616, + "balance_loss_clip": 1.02736318, + "balance_loss_mlp": 1.00503111, + "epoch": 0.43816509348884747, + "flos": 23876214981120.0, + "grad_norm": 2.485890521096174, + "language_loss": 0.73997706, + "learning_rate": 2.492640038606476e-06, + "loss": 0.76178801, + "num_input_tokens_seen": 78525150, + "step": 3644, + "time_per_iteration": 2.8283677101135254 + }, + { + "auxiliary_loss_clip": 0.01132751, + "auxiliary_loss_mlp": 0.01085175, + "balance_loss_clip": 1.02938414, + "balance_loss_mlp": 1.00449419, + "epoch": 0.4382853363794866, + "flos": 14684448533760.0, + "grad_norm": 1.9518294055001522, + "language_loss": 0.7828747, + "learning_rate": 2.491885030856608e-06, + "loss": 0.80505395, + "num_input_tokens_seen": 78543245, + "step": 3645, + "time_per_iteration": 2.670452117919922 + }, + { + "auxiliary_loss_clip": 0.01125917, + "auxiliary_loss_mlp": 0.01085517, + "balance_loss_clip": 1.03021145, + "balance_loss_mlp": 1.00478888, + "epoch": 0.43840557927012563, + "flos": 17165301177600.0, + "grad_norm": 2.3094492061993073, + "language_loss": 0.82701421, + "learning_rate": 2.4911299484881713e-06, + "loss": 0.84912848, + "num_input_tokens_seen": 78560775, + "step": 3646, + "time_per_iteration": 3.6411592960357666 + }, + { + "auxiliary_loss_clip": 0.01126228, + "auxiliary_loss_mlp": 0.01086191, + "balance_loss_clip": 1.03011429, + "balance_loss_mlp": 1.0056057, + "epoch": 0.43852582216076474, + "flos": 19390685316480.0, + "grad_norm": 2.0233131948965055, + "language_loss": 0.80990863, + "learning_rate": 2.490374791615712e-06, + "loss": 0.8320328, + "num_input_tokens_seen": 78580800, + "step": 3647, + "time_per_iteration": 2.7737298011779785 + }, + { + "auxiliary_loss_clip": 0.01145315, + "auxiliary_loss_mlp": 0.00873263, + "balance_loss_clip": 1.03228283, + "balance_loss_mlp": 1.00026035, + "epoch": 0.43864606505140386, + "flos": 18075133699200.0, + "grad_norm": 2.553676907405377, + "language_loss": 0.7702527, + "learning_rate": 2.4896195603537867e-06, + "loss": 0.79043847, + "num_input_tokens_seen": 78595410, + "step": 3648, + "time_per_iteration": 2.6858718395233154 + }, + { + "auxiliary_loss_clip": 0.01094877, + "auxiliary_loss_mlp": 0.01086854, + "balance_loss_clip": 1.02587211, + "balance_loss_mlp": 1.00612533, + "epoch": 0.4387663079420429, + "flos": 19644896845440.0, + "grad_norm": 1.9784283941720509, + "language_loss": 0.74092388, + "learning_rate": 2.488864254816964e-06, + "loss": 0.76274121, + "num_input_tokens_seen": 78614100, + "step": 3649, + "time_per_iteration": 2.83308482170105 + }, + { + "auxiliary_loss_clip": 0.01134371, + "auxiliary_loss_mlp": 0.01084911, + "balance_loss_clip": 1.03100324, + "balance_loss_mlp": 1.00413465, + "epoch": 0.438886550832682, + "flos": 19719339782400.0, + "grad_norm": 5.17356360267527, + "language_loss": 0.68219018, + "learning_rate": 2.4881088751198218e-06, + "loss": 0.70438302, + "num_input_tokens_seen": 78632260, + "step": 3650, + "time_per_iteration": 2.7666714191436768 + }, + { + "auxiliary_loss_clip": 0.01124617, + "auxiliary_loss_mlp": 0.01086393, + "balance_loss_clip": 1.02983272, + "balance_loss_mlp": 1.00552106, + "epoch": 0.43900679372332113, + "flos": 14536675981440.0, + "grad_norm": 2.3348442114608976, + "language_loss": 0.64719117, + "learning_rate": 2.4873534213769517e-06, + "loss": 0.66930127, + "num_input_tokens_seen": 78647490, + "step": 3651, + "time_per_iteration": 3.7014286518096924 + }, + { + "auxiliary_loss_clip": 0.01115047, + "auxiliary_loss_mlp": 0.01085306, + "balance_loss_clip": 1.03034532, + "balance_loss_mlp": 1.00452948, + "epoch": 0.4391270366139602, + "flos": 24056234968320.0, + "grad_norm": 2.405181540947128, + "language_loss": 0.71956277, + "learning_rate": 2.4865978937029547e-06, + "loss": 0.74156624, + "num_input_tokens_seen": 78666470, + "step": 3652, + "time_per_iteration": 2.9125659465789795 + }, + { + "auxiliary_loss_clip": 0.01104701, + "auxiliary_loss_mlp": 0.01085297, + "balance_loss_clip": 1.02844298, + "balance_loss_mlp": 1.00471163, + "epoch": 0.4392472795045993, + "flos": 31538510363520.0, + "grad_norm": 1.6563096925174765, + "language_loss": 0.66172838, + "learning_rate": 2.485842292212445e-06, + "loss": 0.68362838, + "num_input_tokens_seen": 78687685, + "step": 3653, + "time_per_iteration": 2.9637045860290527 + }, + { + "auxiliary_loss_clip": 0.01145871, + "auxiliary_loss_mlp": 0.01085871, + "balance_loss_clip": 1.03280354, + "balance_loss_mlp": 1.0051899, + "epoch": 0.4393675223952384, + "flos": 14866300114560.0, + "grad_norm": 1.6418713123106043, + "language_loss": 0.80252242, + "learning_rate": 2.485086617020045e-06, + "loss": 0.82483989, + "num_input_tokens_seen": 78706180, + "step": 3654, + "time_per_iteration": 2.6294784545898438 + }, + { + "auxiliary_loss_clip": 0.01124341, + "auxiliary_loss_mlp": 0.01084729, + "balance_loss_clip": 1.0293442, + "balance_loss_mlp": 1.00395274, + "epoch": 0.43948776528587746, + "flos": 14825900292480.0, + "grad_norm": 2.184546074878199, + "language_loss": 0.8166945, + "learning_rate": 2.4843308682403903e-06, + "loss": 0.83878517, + "num_input_tokens_seen": 78723095, + "step": 3655, + "time_per_iteration": 3.685927152633667 + }, + { + "auxiliary_loss_clip": 0.01143306, + "auxiliary_loss_mlp": 0.01086473, + "balance_loss_clip": 1.03050709, + "balance_loss_mlp": 1.00569642, + "epoch": 0.4396080081765166, + "flos": 13914523486080.0, + "grad_norm": 1.6663304119127567, + "language_loss": 0.82823956, + "learning_rate": 2.4835750459881294e-06, + "loss": 0.8505373, + "num_input_tokens_seen": 78739720, + "step": 3656, + "time_per_iteration": 2.6525819301605225 + }, + { + "auxiliary_loss_clip": 0.01128841, + "auxiliary_loss_mlp": 0.01087648, + "balance_loss_clip": 1.03194165, + "balance_loss_mlp": 1.00691986, + "epoch": 0.43972825106715563, + "flos": 18222978078720.0, + "grad_norm": 1.683166334867416, + "language_loss": 0.82017589, + "learning_rate": 2.4828191503779177e-06, + "loss": 0.84234077, + "num_input_tokens_seen": 78757820, + "step": 3657, + "time_per_iteration": 3.720975399017334 + }, + { + "auxiliary_loss_clip": 0.01106624, + "auxiliary_loss_mlp": 0.01085924, + "balance_loss_clip": 1.02609956, + "balance_loss_mlp": 1.00519598, + "epoch": 0.43984849395779474, + "flos": 16873239692160.0, + "grad_norm": 1.8599234399964104, + "language_loss": 0.8965022, + "learning_rate": 2.482063181524425e-06, + "loss": 0.91842765, + "num_input_tokens_seen": 78773720, + "step": 3658, + "time_per_iteration": 2.7553060054779053 + }, + { + "auxiliary_loss_clip": 0.01144364, + "auxiliary_loss_mlp": 0.01088431, + "balance_loss_clip": 1.03184581, + "balance_loss_mlp": 1.00741673, + "epoch": 0.43996873684843385, + "flos": 18691504104960.0, + "grad_norm": 2.6307091480458866, + "language_loss": 0.81339967, + "learning_rate": 2.4813071395423307e-06, + "loss": 0.83572763, + "num_input_tokens_seen": 78791285, + "step": 3659, + "time_per_iteration": 2.5979855060577393 + }, + { + "auxiliary_loss_clip": 0.01133905, + "auxiliary_loss_mlp": 0.01087119, + "balance_loss_clip": 1.02982032, + "balance_loss_mlp": 1.00639033, + "epoch": 0.4400889797390729, + "flos": 23653460787840.0, + "grad_norm": 3.846025109768023, + "language_loss": 0.64528644, + "learning_rate": 2.4805510245463263e-06, + "loss": 0.66749668, + "num_input_tokens_seen": 78811440, + "step": 3660, + "time_per_iteration": 2.78057599067688 + }, + { + "auxiliary_loss_clip": 0.01136084, + "auxiliary_loss_mlp": 0.01085038, + "balance_loss_clip": 1.03136969, + "balance_loss_mlp": 1.00430989, + "epoch": 0.440209222629712, + "flos": 23149203707520.0, + "grad_norm": 2.1187922917699438, + "language_loss": 0.60570025, + "learning_rate": 2.4797948366511137e-06, + "loss": 0.62791145, + "num_input_tokens_seen": 78831150, + "step": 3661, + "time_per_iteration": 2.744004726409912 + }, + { + "auxiliary_loss_clip": 0.01116023, + "auxiliary_loss_mlp": 0.01085406, + "balance_loss_clip": 1.02954865, + "balance_loss_mlp": 1.00462961, + "epoch": 0.4403294655203511, + "flos": 24823394668800.0, + "grad_norm": 1.9876113745918849, + "language_loss": 0.76083231, + "learning_rate": 2.4790385759714055e-06, + "loss": 0.78284657, + "num_input_tokens_seen": 78850215, + "step": 3662, + "time_per_iteration": 2.8541476726531982 + }, + { + "auxiliary_loss_clip": 0.0113097, + "auxiliary_loss_mlp": 0.01085781, + "balance_loss_clip": 1.02885151, + "balance_loss_mlp": 1.00500512, + "epoch": 0.4404497084109902, + "flos": 22565080736640.0, + "grad_norm": 1.9156797949128057, + "language_loss": 0.71609402, + "learning_rate": 2.478282242621926e-06, + "loss": 0.73826146, + "num_input_tokens_seen": 78870675, + "step": 3663, + "time_per_iteration": 2.7827155590057373 + }, + { + "auxiliary_loss_clip": 0.01112487, + "auxiliary_loss_mlp": 0.01079446, + "balance_loss_clip": 1.03959215, + "balance_loss_mlp": 1.00010037, + "epoch": 0.4405699513016293, + "flos": 64967073448320.0, + "grad_norm": 0.8413221635058415, + "language_loss": 0.59534305, + "learning_rate": 2.477525836717411e-06, + "loss": 0.61726236, + "num_input_tokens_seen": 78938440, + "step": 3664, + "time_per_iteration": 3.431544303894043 + }, + { + "auxiliary_loss_clip": 0.01133912, + "auxiliary_loss_mlp": 0.01085481, + "balance_loss_clip": 1.0305841, + "balance_loss_mlp": 1.00470471, + "epoch": 0.4406901941922684, + "flos": 35661952978560.0, + "grad_norm": 2.147912462234799, + "language_loss": 0.79761755, + "learning_rate": 2.476769358372606e-06, + "loss": 0.81981146, + "num_input_tokens_seen": 78960090, + "step": 3665, + "time_per_iteration": 2.8932371139526367 + }, + { + "auxiliary_loss_clip": 0.01109453, + "auxiliary_loss_mlp": 0.0108562, + "balance_loss_clip": 1.02958906, + "balance_loss_mlp": 1.004987, + "epoch": 0.44081043708290746, + "flos": 18040767361920.0, + "grad_norm": 2.481003735509708, + "language_loss": 0.75262332, + "learning_rate": 2.4760128077022683e-06, + "loss": 0.77457404, + "num_input_tokens_seen": 78978225, + "step": 3666, + "time_per_iteration": 2.800661563873291 + }, + { + "auxiliary_loss_clip": 0.01083592, + "auxiliary_loss_mlp": 0.01085086, + "balance_loss_clip": 1.02730107, + "balance_loss_mlp": 1.00435734, + "epoch": 0.44093067997354657, + "flos": 30153507799680.0, + "grad_norm": 1.6167759566903377, + "language_loss": 0.68356305, + "learning_rate": 2.4752561848211672e-06, + "loss": 0.70524985, + "num_input_tokens_seen": 79000625, + "step": 3667, + "time_per_iteration": 2.9540011882781982 + }, + { + "auxiliary_loss_clip": 0.01136546, + "auxiliary_loss_mlp": 0.01086732, + "balance_loss_clip": 1.03358984, + "balance_loss_mlp": 1.00609827, + "epoch": 0.4410509228641857, + "flos": 23255068066560.0, + "grad_norm": 1.8116635768640135, + "language_loss": 0.71461058, + "learning_rate": 2.4744994898440797e-06, + "loss": 0.73684335, + "num_input_tokens_seen": 79019415, + "step": 3668, + "time_per_iteration": 2.7840263843536377 + }, + { + "auxiliary_loss_clip": 0.01116525, + "auxiliary_loss_mlp": 0.01085546, + "balance_loss_clip": 1.02961695, + "balance_loss_mlp": 1.00486493, + "epoch": 0.44117116575482473, + "flos": 19500571998720.0, + "grad_norm": 1.8771508524054898, + "language_loss": 0.83383048, + "learning_rate": 2.473742722885797e-06, + "loss": 0.85585117, + "num_input_tokens_seen": 79038435, + "step": 3669, + "time_per_iteration": 2.841949224472046 + }, + { + "auxiliary_loss_clip": 0.0113069, + "auxiliary_loss_mlp": 0.00873167, + "balance_loss_clip": 1.02881265, + "balance_loss_mlp": 1.00020671, + "epoch": 0.44129140864546385, + "flos": 27053124353280.0, + "grad_norm": 2.020526559721073, + "language_loss": 0.65121102, + "learning_rate": 2.4729858840611197e-06, + "loss": 0.67124963, + "num_input_tokens_seen": 79057345, + "step": 3670, + "time_per_iteration": 2.731250762939453 + }, + { + "auxiliary_loss_clip": 0.01145243, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_clip": 1.03280282, + "balance_loss_mlp": 1.0044024, + "epoch": 0.4414116515361029, + "flos": 26102101910400.0, + "grad_norm": 1.85339727778406, + "language_loss": 0.72399092, + "learning_rate": 2.4722289734848605e-06, + "loss": 0.7462942, + "num_input_tokens_seen": 79077810, + "step": 3671, + "time_per_iteration": 3.51842999458313 + }, + { + "auxiliary_loss_clip": 0.01110881, + "auxiliary_loss_mlp": 0.01086119, + "balance_loss_clip": 1.02645051, + "balance_loss_mlp": 1.0055815, + "epoch": 0.441531894426742, + "flos": 21906083865600.0, + "grad_norm": 1.9503717222143628, + "language_loss": 0.77546239, + "learning_rate": 2.471471991271841e-06, + "loss": 0.79743236, + "num_input_tokens_seen": 79094935, + "step": 3672, + "time_per_iteration": 2.8013949394226074 + }, + { + "auxiliary_loss_clip": 0.01135753, + "auxiliary_loss_mlp": 0.01086581, + "balance_loss_clip": 1.03120899, + "balance_loss_mlp": 1.00585222, + "epoch": 0.4416521373173811, + "flos": 23437099215360.0, + "grad_norm": 1.874979206282978, + "language_loss": 0.7927475, + "learning_rate": 2.470714937536896e-06, + "loss": 0.81497085, + "num_input_tokens_seen": 79113660, + "step": 3673, + "time_per_iteration": 2.695420265197754 + }, + { + "auxiliary_loss_clip": 0.01102066, + "auxiliary_loss_mlp": 0.0108722, + "balance_loss_clip": 1.02589083, + "balance_loss_mlp": 1.00649154, + "epoch": 0.4417723802080202, + "flos": 20334345471360.0, + "grad_norm": 1.9365720436356972, + "language_loss": 0.70445329, + "learning_rate": 2.469957812394868e-06, + "loss": 0.72634614, + "num_input_tokens_seen": 79132470, + "step": 3674, + "time_per_iteration": 2.7795870304107666 + }, + { + "auxiliary_loss_clip": 0.01145801, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_clip": 1.03317571, + "balance_loss_mlp": 1.00516045, + "epoch": 0.4418926230986593, + "flos": 18880682060160.0, + "grad_norm": 1.8994609614121811, + "language_loss": 0.7629509, + "learning_rate": 2.4692006159606148e-06, + "loss": 0.78526729, + "num_input_tokens_seen": 79150000, + "step": 3675, + "time_per_iteration": 2.6529531478881836 + }, + { + "auxiliary_loss_clip": 0.01145143, + "auxiliary_loss_mlp": 0.01086158, + "balance_loss_clip": 1.03215241, + "balance_loss_mlp": 1.00547719, + "epoch": 0.4420128659892984, + "flos": 19464409981440.0, + "grad_norm": 1.8212114648365136, + "language_loss": 0.78362817, + "learning_rate": 2.468443348349e-06, + "loss": 0.80594116, + "num_input_tokens_seen": 79167875, + "step": 3676, + "time_per_iteration": 2.6195878982543945 + }, + { + "auxiliary_loss_clip": 0.01109977, + "auxiliary_loss_mlp": 0.01083888, + "balance_loss_clip": 1.03115284, + "balance_loss_mlp": 1.00311172, + "epoch": 0.44213310887993745, + "flos": 17894359526400.0, + "grad_norm": 2.701811827452748, + "language_loss": 0.82329094, + "learning_rate": 2.467686009674902e-06, + "loss": 0.84522957, + "num_input_tokens_seen": 79182325, + "step": 3677, + "time_per_iteration": 3.8202927112579346 + }, + { + "auxiliary_loss_clip": 0.01136531, + "auxiliary_loss_mlp": 0.01086115, + "balance_loss_clip": 1.03164172, + "balance_loss_mlp": 1.00533926, + "epoch": 0.44225335177057656, + "flos": 19204667758080.0, + "grad_norm": 2.098390416703836, + "language_loss": 0.85138708, + "learning_rate": 2.466928600053209e-06, + "loss": 0.8736136, + "num_input_tokens_seen": 79197630, + "step": 3678, + "time_per_iteration": 2.6598453521728516 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01085479, + "balance_loss_clip": 1.02791917, + "balance_loss_mlp": 1.00484633, + "epoch": 0.4423735946612157, + "flos": 23471321898240.0, + "grad_norm": 1.724860701079712, + "language_loss": 0.71106178, + "learning_rate": 2.466171119598818e-06, + "loss": 0.7331332, + "num_input_tokens_seen": 79217600, + "step": 3679, + "time_per_iteration": 3.8163235187530518 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01085827, + "balance_loss_clip": 1.03114545, + "balance_loss_mlp": 1.00505137, + "epoch": 0.44249383755185473, + "flos": 26685398868480.0, + "grad_norm": 2.609424855724555, + "language_loss": 0.77263594, + "learning_rate": 2.465413568426639e-06, + "loss": 0.79485935, + "num_input_tokens_seen": 79238550, + "step": 3680, + "time_per_iteration": 2.7217657566070557 + }, + { + "auxiliary_loss_clip": 0.0113467, + "auxiliary_loss_mlp": 0.01085404, + "balance_loss_clip": 1.03129005, + "balance_loss_mlp": 1.00486612, + "epoch": 0.44261408044249384, + "flos": 23147659422720.0, + "grad_norm": 1.5526831122688105, + "language_loss": 0.80832595, + "learning_rate": 2.464655946651591e-06, + "loss": 0.83052665, + "num_input_tokens_seen": 79257555, + "step": 3681, + "time_per_iteration": 2.7066328525543213 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_clip": 1.03138638, + "balance_loss_mlp": 1.00567412, + "epoch": 0.44273432333313295, + "flos": 24462564595200.0, + "grad_norm": 1.8803279506148298, + "language_loss": 0.81151748, + "learning_rate": 2.4638982543886065e-06, + "loss": 0.83373857, + "num_input_tokens_seen": 79277595, + "step": 3682, + "time_per_iteration": 3.693143606185913 + }, + { + "auxiliary_loss_clip": 0.01137012, + "auxiliary_loss_mlp": 0.01086052, + "balance_loss_clip": 1.0328182, + "balance_loss_mlp": 1.00532305, + "epoch": 0.442854566223772, + "flos": 17528932512000.0, + "grad_norm": 2.320319440196322, + "language_loss": 0.87004042, + "learning_rate": 2.4631404917526254e-06, + "loss": 0.89227104, + "num_input_tokens_seen": 79294550, + "step": 3683, + "time_per_iteration": 2.6815643310546875 + }, + { + "auxiliary_loss_clip": 0.01136987, + "auxiliary_loss_mlp": 0.0108626, + "balance_loss_clip": 1.0322516, + "balance_loss_mlp": 1.00562716, + "epoch": 0.4429748091144111, + "flos": 24896293320960.0, + "grad_norm": 1.453250442085366, + "language_loss": 0.79260159, + "learning_rate": 2.4623826588586e-06, + "loss": 0.81483406, + "num_input_tokens_seen": 79314820, + "step": 3684, + "time_per_iteration": 2.7402589321136475 + }, + { + "auxiliary_loss_clip": 0.01126663, + "auxiliary_loss_mlp": 0.01086955, + "balance_loss_clip": 1.02985966, + "balance_loss_mlp": 1.0061785, + "epoch": 0.4430950520050502, + "flos": 21614704738560.0, + "grad_norm": 1.501005398511098, + "language_loss": 0.82896876, + "learning_rate": 2.461624755821492e-06, + "loss": 0.85110492, + "num_input_tokens_seen": 79334300, + "step": 3685, + "time_per_iteration": 2.7819764614105225 + }, + { + "auxiliary_loss_clip": 0.01101792, + "auxiliary_loss_mlp": 0.0108487, + "balance_loss_clip": 1.03182685, + "balance_loss_mlp": 1.00428438, + "epoch": 0.4432152948956893, + "flos": 24572271709440.0, + "grad_norm": 1.8929648540469162, + "language_loss": 0.76466137, + "learning_rate": 2.4608667827562763e-06, + "loss": 0.78652799, + "num_input_tokens_seen": 79353630, + "step": 3686, + "time_per_iteration": 2.809736728668213 + }, + { + "auxiliary_loss_clip": 0.01136861, + "auxiliary_loss_mlp": 0.01087768, + "balance_loss_clip": 1.0320797, + "balance_loss_mlp": 1.00694442, + "epoch": 0.4433355377863284, + "flos": 21762261809280.0, + "grad_norm": 1.9806613099518084, + "language_loss": 0.89696598, + "learning_rate": 2.460108739777936e-06, + "loss": 0.91921234, + "num_input_tokens_seen": 79372765, + "step": 3687, + "time_per_iteration": 2.741464376449585 + }, + { + "auxiliary_loss_clip": 0.01118489, + "auxiliary_loss_mlp": 0.01085644, + "balance_loss_clip": 1.02876568, + "balance_loss_mlp": 1.0049634, + "epoch": 0.44345578067696745, + "flos": 20084479488000.0, + "grad_norm": 1.54699830267325, + "language_loss": 0.7641201, + "learning_rate": 2.4593506270014656e-06, + "loss": 0.78616142, + "num_input_tokens_seen": 79391735, + "step": 3688, + "time_per_iteration": 2.7032084465026855 + }, + { + "auxiliary_loss_clip": 0.01125402, + "auxiliary_loss_mlp": 0.01084981, + "balance_loss_clip": 1.02943635, + "balance_loss_mlp": 1.00425303, + "epoch": 0.44357602356760656, + "flos": 24169497528960.0, + "grad_norm": 1.5626831424801868, + "language_loss": 0.82037735, + "learning_rate": 2.45859244454187e-06, + "loss": 0.84248114, + "num_input_tokens_seen": 79411525, + "step": 3689, + "time_per_iteration": 2.8266351222991943 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01086187, + "balance_loss_clip": 1.03204775, + "balance_loss_mlp": 1.00560141, + "epoch": 0.44369626645824567, + "flos": 22707717644160.0, + "grad_norm": 1.6598955657240182, + "language_loss": 0.65962076, + "learning_rate": 2.4578341925141655e-06, + "loss": 0.68183082, + "num_input_tokens_seen": 79430740, + "step": 3690, + "time_per_iteration": 2.7093498706817627 + }, + { + "auxiliary_loss_clip": 0.01138071, + "auxiliary_loss_mlp": 0.01086185, + "balance_loss_clip": 1.03258479, + "balance_loss_mlp": 1.00545645, + "epoch": 0.4438165093488847, + "flos": 38030225420160.0, + "grad_norm": 1.8079956806566078, + "language_loss": 0.72680902, + "learning_rate": 2.457075871033378e-06, + "loss": 0.74905169, + "num_input_tokens_seen": 79452615, + "step": 3691, + "time_per_iteration": 2.9265308380126953 + }, + { + "auxiliary_loss_clip": 0.01115629, + "auxiliary_loss_mlp": 0.01086047, + "balance_loss_clip": 1.02922535, + "balance_loss_mlp": 1.00541425, + "epoch": 0.44393675223952384, + "flos": 15523213996800.0, + "grad_norm": 2.0325841413364865, + "language_loss": 0.88569462, + "learning_rate": 2.4563174802145445e-06, + "loss": 0.90771139, + "num_input_tokens_seen": 79469865, + "step": 3692, + "time_per_iteration": 2.870121479034424 + }, + { + "auxiliary_loss_clip": 0.01122042, + "auxiliary_loss_mlp": 0.01080391, + "balance_loss_clip": 1.04116583, + "balance_loss_mlp": 1.00104487, + "epoch": 0.44405699513016295, + "flos": 64574893779840.0, + "grad_norm": 0.6514775988700271, + "language_loss": 0.48563039, + "learning_rate": 2.455559020172712e-06, + "loss": 0.50765467, + "num_input_tokens_seen": 79537220, + "step": 3693, + "time_per_iteration": 3.4373888969421387 + }, + { + "auxiliary_loss_clip": 0.01106532, + "auxiliary_loss_mlp": 0.01086439, + "balance_loss_clip": 1.02973795, + "balance_loss_mlp": 1.0057106, + "epoch": 0.444177238020802, + "flos": 23987394552960.0, + "grad_norm": 1.934514433461544, + "language_loss": 0.89581704, + "learning_rate": 2.4548004910229385e-06, + "loss": 0.91774672, + "num_input_tokens_seen": 79554795, + "step": 3694, + "time_per_iteration": 2.8676021099090576 + }, + { + "auxiliary_loss_clip": 0.01136242, + "auxiliary_loss_mlp": 0.00873068, + "balance_loss_clip": 1.03188801, + "balance_loss_mlp": 1.00021529, + "epoch": 0.4442974809114411, + "flos": 22563069575040.0, + "grad_norm": 1.757674156775397, + "language_loss": 0.86712599, + "learning_rate": 2.4540418928802913e-06, + "loss": 0.88721919, + "num_input_tokens_seen": 79573530, + "step": 3695, + "time_per_iteration": 2.7621638774871826 + }, + { + "auxiliary_loss_clip": 0.01127643, + "auxiliary_loss_mlp": 0.01087794, + "balance_loss_clip": 1.03120184, + "balance_loss_mlp": 1.00701737, + "epoch": 0.4444177238020802, + "flos": 17675699483520.0, + "grad_norm": 1.96167690847421, + "language_loss": 0.66203094, + "learning_rate": 2.4532832258598506e-06, + "loss": 0.68418527, + "num_input_tokens_seen": 79591360, + "step": 3696, + "time_per_iteration": 2.756829261779785 + }, + { + "auxiliary_loss_clip": 0.01144696, + "auxiliary_loss_mlp": 0.01086028, + "balance_loss_clip": 1.03204274, + "balance_loss_mlp": 1.00544214, + "epoch": 0.4445379666927193, + "flos": 28621594609920.0, + "grad_norm": 2.0840101293650024, + "language_loss": 0.80601275, + "learning_rate": 2.4525244900767047e-06, + "loss": 0.82832003, + "num_input_tokens_seen": 79612175, + "step": 3697, + "time_per_iteration": 3.6577680110931396 + }, + { + "auxiliary_loss_clip": 0.01127416, + "auxiliary_loss_mlp": 0.01079887, + "balance_loss_clip": 1.03840208, + "balance_loss_mlp": 1.00054133, + "epoch": 0.4446582095833584, + "flos": 70487370115200.0, + "grad_norm": 0.7703317550530575, + "language_loss": 0.60504019, + "learning_rate": 2.4517656856459536e-06, + "loss": 0.62711322, + "num_input_tokens_seen": 79678020, + "step": 3698, + "time_per_iteration": 3.374417781829834 + }, + { + "auxiliary_loss_clip": 0.01133651, + "auxiliary_loss_mlp": 0.01085516, + "balance_loss_clip": 1.02957737, + "balance_loss_mlp": 1.00478792, + "epoch": 0.4447784524739975, + "flos": 26505199313280.0, + "grad_norm": 1.7406898631796948, + "language_loss": 0.6809833, + "learning_rate": 2.4510068126827073e-06, + "loss": 0.70317501, + "num_input_tokens_seen": 79699020, + "step": 3699, + "time_per_iteration": 2.7286605834960938 + }, + { + "auxiliary_loss_clip": 0.01126491, + "auxiliary_loss_mlp": 0.01084691, + "balance_loss_clip": 1.03102887, + "balance_loss_mlp": 1.00400984, + "epoch": 0.44489869536463655, + "flos": 11656209553920.0, + "grad_norm": 2.123356466549024, + "language_loss": 0.81597745, + "learning_rate": 2.450247871302086e-06, + "loss": 0.83808929, + "num_input_tokens_seen": 79716795, + "step": 3700, + "time_per_iteration": 2.6785788536071777 + }, + { + "auxiliary_loss_clip": 0.01121181, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_clip": 1.03190601, + "balance_loss_mlp": 1.0045594, + "epoch": 0.44501893825527566, + "flos": 20448469958400.0, + "grad_norm": 2.632034017362276, + "language_loss": 0.83484495, + "learning_rate": 2.44948886161922e-06, + "loss": 0.85690868, + "num_input_tokens_seen": 79735810, + "step": 3701, + "time_per_iteration": 2.720705986022949 + }, + { + "auxiliary_loss_clip": 0.01135317, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_clip": 1.03069234, + "balance_loss_mlp": 1.003739, + "epoch": 0.4451391811459148, + "flos": 18261079430400.0, + "grad_norm": 1.4893411829011194, + "language_loss": 0.84917426, + "learning_rate": 2.4487297837492524e-06, + "loss": 0.87137067, + "num_input_tokens_seen": 79754975, + "step": 3702, + "time_per_iteration": 3.6231703758239746 + }, + { + "auxiliary_loss_clip": 0.01106224, + "auxiliary_loss_mlp": 0.01086626, + "balance_loss_clip": 1.02636516, + "balance_loss_mlp": 1.00589776, + "epoch": 0.44525942403655383, + "flos": 16910155895040.0, + "grad_norm": 1.9952533986622907, + "language_loss": 0.62050408, + "learning_rate": 2.4479706378073323e-06, + "loss": 0.64243257, + "num_input_tokens_seen": 79773515, + "step": 3703, + "time_per_iteration": 2.741580009460449 + }, + { + "auxiliary_loss_clip": 0.01117991, + "auxiliary_loss_mlp": 0.01084616, + "balance_loss_clip": 1.02981591, + "balance_loss_mlp": 1.00403023, + "epoch": 0.44537966692719294, + "flos": 23258838994560.0, + "grad_norm": 1.5066956666466464, + "language_loss": 0.83837378, + "learning_rate": 2.447211423908623e-06, + "loss": 0.86039984, + "num_input_tokens_seen": 79793560, + "step": 3704, + "time_per_iteration": 2.82124924659729 + }, + { + "auxiliary_loss_clip": 0.0113597, + "auxiliary_loss_mlp": 0.01085445, + "balance_loss_clip": 1.03129876, + "balance_loss_mlp": 1.00481188, + "epoch": 0.445499909817832, + "flos": 21724160457600.0, + "grad_norm": 1.9179033239503556, + "language_loss": 0.74921608, + "learning_rate": 2.4464521421682966e-06, + "loss": 0.77143025, + "num_input_tokens_seen": 79811150, + "step": 3705, + "time_per_iteration": 3.7030396461486816 + }, + { + "auxiliary_loss_clip": 0.01134991, + "auxiliary_loss_mlp": 0.01085191, + "balance_loss_clip": 1.0321691, + "balance_loss_mlp": 1.00465298, + "epoch": 0.4456201527084711, + "flos": 23987969170560.0, + "grad_norm": 1.313530876897494, + "language_loss": 0.87560928, + "learning_rate": 2.4456927927015345e-06, + "loss": 0.89781106, + "num_input_tokens_seen": 79832190, + "step": 3706, + "time_per_iteration": 2.7181529998779297 + }, + { + "auxiliary_loss_clip": 0.01126731, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_clip": 1.03087854, + "balance_loss_mlp": 1.00539482, + "epoch": 0.4457403955991102, + "flos": 18807065136000.0, + "grad_norm": 2.685129379702711, + "language_loss": 0.76414847, + "learning_rate": 2.4449333756235307e-06, + "loss": 0.78627753, + "num_input_tokens_seen": 79848905, + "step": 3707, + "time_per_iteration": 2.781151056289673 + }, + { + "auxiliary_loss_clip": 0.01122368, + "auxiliary_loss_mlp": 0.01086965, + "balance_loss_clip": 1.03324437, + "balance_loss_mlp": 1.00618935, + "epoch": 0.4458606384897493, + "flos": 19207756327680.0, + "grad_norm": 2.266635016135338, + "language_loss": 0.78614295, + "learning_rate": 2.4441738910494876e-06, + "loss": 0.8082363, + "num_input_tokens_seen": 79863640, + "step": 3708, + "time_per_iteration": 3.5263986587524414 + }, + { + "auxiliary_loss_clip": 0.01127766, + "auxiliary_loss_mlp": 0.01085299, + "balance_loss_clip": 1.03133583, + "balance_loss_mlp": 1.00452232, + "epoch": 0.4459808813803884, + "flos": 21361283308800.0, + "grad_norm": 1.7458809258387211, + "language_loss": 0.8227132, + "learning_rate": 2.4434143390946176e-06, + "loss": 0.8448438, + "num_input_tokens_seen": 79882450, + "step": 3709, + "time_per_iteration": 2.7987897396087646 + }, + { + "auxiliary_loss_clip": 0.0111625, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_clip": 1.03063762, + "balance_loss_mlp": 1.00443196, + "epoch": 0.4461011242710275, + "flos": 23288967527040.0, + "grad_norm": 1.9029689320415553, + "language_loss": 0.85485005, + "learning_rate": 2.4426547198741457e-06, + "loss": 0.87686372, + "num_input_tokens_seen": 79900655, + "step": 3710, + "time_per_iteration": 2.807237148284912 + }, + { + "auxiliary_loss_clip": 0.01095728, + "auxiliary_loss_mlp": 0.01086091, + "balance_loss_clip": 1.02496576, + "balance_loss_mlp": 1.00540972, + "epoch": 0.44622136716166655, + "flos": 20193001453440.0, + "grad_norm": 2.152321137013609, + "language_loss": 0.74679357, + "learning_rate": 2.441895033503305e-06, + "loss": 0.76861173, + "num_input_tokens_seen": 79918575, + "step": 3711, + "time_per_iteration": 2.829364776611328 + }, + { + "auxiliary_loss_clip": 0.0113611, + "auxiliary_loss_mlp": 0.01085696, + "balance_loss_clip": 1.03274357, + "balance_loss_mlp": 1.00472856, + "epoch": 0.44634161005230566, + "flos": 21283033530240.0, + "grad_norm": 1.654088887007401, + "language_loss": 0.81751871, + "learning_rate": 2.4411352800973375e-06, + "loss": 0.83973676, + "num_input_tokens_seen": 79937010, + "step": 3712, + "time_per_iteration": 2.742612838745117 + }, + { + "auxiliary_loss_clip": 0.01117158, + "auxiliary_loss_mlp": 0.01087062, + "balance_loss_clip": 1.03008413, + "balance_loss_mlp": 1.00628614, + "epoch": 0.44646185294294477, + "flos": 22929358515840.0, + "grad_norm": 2.277752466203071, + "language_loss": 0.75375295, + "learning_rate": 2.4403754597715005e-06, + "loss": 0.77579522, + "num_input_tokens_seen": 79956455, + "step": 3713, + "time_per_iteration": 2.770914316177368 + }, + { + "auxiliary_loss_clip": 0.01126638, + "auxiliary_loss_mlp": 0.01085058, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.004282, + "epoch": 0.4465820958335838, + "flos": 22637692080000.0, + "grad_norm": 2.0024867311647654, + "language_loss": 0.92490005, + "learning_rate": 2.4396155726410553e-06, + "loss": 0.94701707, + "num_input_tokens_seen": 79975065, + "step": 3714, + "time_per_iteration": 2.7718372344970703 + }, + { + "auxiliary_loss_clip": 0.01121015, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_clip": 1.03157282, + "balance_loss_mlp": 1.00533128, + "epoch": 0.44670233872422294, + "flos": 22672525294080.0, + "grad_norm": 2.4582508137864316, + "language_loss": 0.90979868, + "learning_rate": 2.438855618821278e-06, + "loss": 0.93186992, + "num_input_tokens_seen": 79990865, + "step": 3715, + "time_per_iteration": 2.6754982471466064 + }, + { + "auxiliary_loss_clip": 0.01135623, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.03081703, + "balance_loss_mlp": 1.00370824, + "epoch": 0.44682258161486205, + "flos": 23582178247680.0, + "grad_norm": 1.5507439527172524, + "language_loss": 0.67162472, + "learning_rate": 2.4380955984274517e-06, + "loss": 0.69382483, + "num_input_tokens_seen": 80009520, + "step": 3716, + "time_per_iteration": 2.7471816539764404 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01087188, + "balance_loss_clip": 1.03038919, + "balance_loss_mlp": 1.00641167, + "epoch": 0.4469428245055011, + "flos": 26501356558080.0, + "grad_norm": 1.8173234648349603, + "language_loss": 0.76688612, + "learning_rate": 2.4373355115748716e-06, + "loss": 0.78911358, + "num_input_tokens_seen": 80030350, + "step": 3717, + "time_per_iteration": 2.6973681449890137 + }, + { + "auxiliary_loss_clip": 0.01126, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_clip": 1.03075743, + "balance_loss_mlp": 1.00469005, + "epoch": 0.4470630673961402, + "flos": 21504925797120.0, + "grad_norm": 1.7414219885593218, + "language_loss": 0.71823299, + "learning_rate": 2.436575358378842e-06, + "loss": 0.74034715, + "num_input_tokens_seen": 80049840, + "step": 3718, + "time_per_iteration": 2.781250238418579 + }, + { + "auxiliary_loss_clip": 0.01126421, + "auxiliary_loss_mlp": 0.0108666, + "balance_loss_clip": 1.03082657, + "balance_loss_mlp": 1.00588346, + "epoch": 0.44718331028677927, + "flos": 16173986653440.0, + "grad_norm": 2.669739722746967, + "language_loss": 0.82464975, + "learning_rate": 2.4358151389546782e-06, + "loss": 0.84678054, + "num_input_tokens_seen": 80066525, + "step": 3719, + "time_per_iteration": 2.740854024887085 + }, + { + "auxiliary_loss_clip": 0.01145684, + "auxiliary_loss_mlp": 0.01087215, + "balance_loss_clip": 1.03242242, + "balance_loss_mlp": 1.00648665, + "epoch": 0.4473035531774184, + "flos": 19681238430720.0, + "grad_norm": 2.2305911608872915, + "language_loss": 0.76057315, + "learning_rate": 2.4350548534177035e-06, + "loss": 0.78290212, + "num_input_tokens_seen": 80083355, + "step": 3720, + "time_per_iteration": 2.6648614406585693 + }, + { + "auxiliary_loss_clip": 0.01115949, + "auxiliary_loss_mlp": 0.01084093, + "balance_loss_clip": 1.02986944, + "balance_loss_mlp": 1.0035553, + "epoch": 0.4474237960680575, + "flos": 41427590515200.0, + "grad_norm": 1.6592972309706395, + "language_loss": 0.66450799, + "learning_rate": 2.434294501883254e-06, + "loss": 0.68650842, + "num_input_tokens_seen": 80106450, + "step": 3721, + "time_per_iteration": 3.1106836795806885 + }, + { + "auxiliary_loss_clip": 0.011284, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_clip": 1.0317347, + "balance_loss_mlp": 1.00524902, + "epoch": 0.44754403895869654, + "flos": 22891328991360.0, + "grad_norm": 1.6574653254081522, + "language_loss": 0.65792024, + "learning_rate": 2.433534084466674e-06, + "loss": 0.68006307, + "num_input_tokens_seen": 80125670, + "step": 3722, + "time_per_iteration": 3.720404624938965 + }, + { + "auxiliary_loss_clip": 0.01143109, + "auxiliary_loss_mlp": 0.01085883, + "balance_loss_clip": 1.03066468, + "balance_loss_mlp": 1.00520217, + "epoch": 0.44766428184933565, + "flos": 25630271832960.0, + "grad_norm": 1.573553388701234, + "language_loss": 0.70899999, + "learning_rate": 2.4327736012833178e-06, + "loss": 0.73128998, + "num_input_tokens_seen": 80147390, + "step": 3723, + "time_per_iteration": 2.724722146987915 + }, + { + "auxiliary_loss_clip": 0.01132305, + "auxiliary_loss_mlp": 0.01085615, + "balance_loss_clip": 1.0288794, + "balance_loss_mlp": 1.00493383, + "epoch": 0.44778452473997477, + "flos": 20448972748800.0, + "grad_norm": 2.110960130530687, + "language_loss": 0.76345628, + "learning_rate": 2.4320130524485506e-06, + "loss": 0.78563547, + "num_input_tokens_seen": 80166185, + "step": 3724, + "time_per_iteration": 2.6361730098724365 + }, + { + "auxiliary_loss_clip": 0.01120107, + "auxiliary_loss_mlp": 0.01086236, + "balance_loss_clip": 1.02824926, + "balance_loss_mlp": 1.00555563, + "epoch": 0.4479047676306138, + "flos": 21975462984960.0, + "grad_norm": 1.4374925165532888, + "language_loss": 0.7970494, + "learning_rate": 2.431252438077746e-06, + "loss": 0.81911278, + "num_input_tokens_seen": 80185685, + "step": 3725, + "time_per_iteration": 2.7903499603271484 + }, + { + "auxiliary_loss_clip": 0.01136419, + "auxiliary_loss_mlp": 0.00873202, + "balance_loss_clip": 1.03176379, + "balance_loss_mlp": 1.00024271, + "epoch": 0.44802501052125293, + "flos": 21467219495040.0, + "grad_norm": 4.573089429738885, + "language_loss": 0.76809448, + "learning_rate": 2.4304917582862906e-06, + "loss": 0.78819072, + "num_input_tokens_seen": 80204865, + "step": 3726, + "time_per_iteration": 2.71523380279541 + }, + { + "auxiliary_loss_clip": 0.01144343, + "auxiliary_loss_mlp": 0.01085319, + "balance_loss_clip": 1.03171468, + "balance_loss_mlp": 1.00468588, + "epoch": 0.44814525341189204, + "flos": 22126970551680.0, + "grad_norm": 1.860853950407689, + "language_loss": 0.87545705, + "learning_rate": 2.4297310131895774e-06, + "loss": 0.89775372, + "num_input_tokens_seen": 80223410, + "step": 3727, + "time_per_iteration": 2.6575915813446045 + }, + { + "auxiliary_loss_clip": 0.01132781, + "auxiliary_loss_mlp": 0.01085042, + "balance_loss_clip": 1.02948201, + "balance_loss_mlp": 1.00436151, + "epoch": 0.4482654963025311, + "flos": 16653933204480.0, + "grad_norm": 2.1930874447006796, + "language_loss": 0.7481845, + "learning_rate": 2.4289702029030113e-06, + "loss": 0.77036279, + "num_input_tokens_seen": 80240880, + "step": 3728, + "time_per_iteration": 3.627267837524414 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01084681, + "balance_loss_clip": 1.03255141, + "balance_loss_mlp": 1.00404751, + "epoch": 0.4483857391931702, + "flos": 18841251905280.0, + "grad_norm": 1.8172875372291823, + "language_loss": 0.83086276, + "learning_rate": 2.4282093275420057e-06, + "loss": 0.85307103, + "num_input_tokens_seen": 80259910, + "step": 3729, + "time_per_iteration": 2.7027628421783447 + }, + { + "auxiliary_loss_clip": 0.01120956, + "auxiliary_loss_mlp": 0.01087091, + "balance_loss_clip": 1.03209662, + "balance_loss_mlp": 1.00631535, + "epoch": 0.4485059820838093, + "flos": 20372590477440.0, + "grad_norm": 1.9405783475630802, + "language_loss": 0.70574272, + "learning_rate": 2.4274483872219863e-06, + "loss": 0.72782326, + "num_input_tokens_seen": 80277270, + "step": 3730, + "time_per_iteration": 2.699394941329956 + }, + { + "auxiliary_loss_clip": 0.01135426, + "auxiliary_loss_mlp": 0.0108561, + "balance_loss_clip": 1.03148651, + "balance_loss_mlp": 1.00507188, + "epoch": 0.4486262249744484, + "flos": 20047742853120.0, + "grad_norm": 1.6553488524546638, + "language_loss": 0.93504375, + "learning_rate": 2.426687382058386e-06, + "loss": 0.95725417, + "num_input_tokens_seen": 80295550, + "step": 3731, + "time_per_iteration": 3.7241463661193848 + }, + { + "auxiliary_loss_clip": 0.01122474, + "auxiliary_loss_mlp": 0.01079815, + "balance_loss_clip": 1.03347111, + "balance_loss_mlp": 1.00046909, + "epoch": 0.4487464678650875, + "flos": 64595684776320.0, + "grad_norm": 0.870596932459431, + "language_loss": 0.59846568, + "learning_rate": 2.425926312166649e-06, + "loss": 0.62048858, + "num_input_tokens_seen": 80348425, + "step": 3732, + "time_per_iteration": 3.1203463077545166 + }, + { + "auxiliary_loss_clip": 0.01125403, + "auxiliary_loss_mlp": 0.01086142, + "balance_loss_clip": 1.0302043, + "balance_loss_mlp": 1.00531769, + "epoch": 0.4488667107557266, + "flos": 20769798049920.0, + "grad_norm": 1.9828902727237223, + "language_loss": 0.73253798, + "learning_rate": 2.42516517766223e-06, + "loss": 0.75465345, + "num_input_tokens_seen": 80366505, + "step": 3733, + "time_per_iteration": 3.689493179321289 + }, + { + "auxiliary_loss_clip": 0.01145337, + "auxiliary_loss_mlp": 0.01086351, + "balance_loss_clip": 1.03299856, + "balance_loss_mlp": 1.00557458, + "epoch": 0.44898695364636565, + "flos": 23951735326080.0, + "grad_norm": 1.801278292099805, + "language_loss": 0.68345481, + "learning_rate": 2.4244039786605907e-06, + "loss": 0.70577168, + "num_input_tokens_seen": 80387510, + "step": 3734, + "time_per_iteration": 2.677967071533203 + }, + { + "auxiliary_loss_clip": 0.01107373, + "auxiliary_loss_mlp": 0.01085349, + "balance_loss_clip": 1.02778792, + "balance_loss_mlp": 1.00466824, + "epoch": 0.44910719653700476, + "flos": 18624351628800.0, + "grad_norm": 8.732330692372784, + "language_loss": 0.82647598, + "learning_rate": 2.4236427152772055e-06, + "loss": 0.84840322, + "num_input_tokens_seen": 80405915, + "step": 3735, + "time_per_iteration": 2.9561445713043213 + }, + { + "auxiliary_loss_clip": 0.01098338, + "auxiliary_loss_mlp": 0.01080273, + "balance_loss_clip": 1.02707434, + "balance_loss_mlp": 1.00092757, + "epoch": 0.4492274394276438, + "flos": 57033435749760.0, + "grad_norm": 0.827507286092444, + "language_loss": 0.57330537, + "learning_rate": 2.422881387627557e-06, + "loss": 0.59509146, + "num_input_tokens_seen": 80458365, + "step": 3736, + "time_per_iteration": 3.013640880584717 + }, + { + "auxiliary_loss_clip": 0.01111764, + "auxiliary_loss_mlp": 0.01085078, + "balance_loss_clip": 1.03249764, + "balance_loss_mlp": 1.00434971, + "epoch": 0.4493476823182829, + "flos": 23254888498560.0, + "grad_norm": 1.800329230705561, + "language_loss": 0.77417541, + "learning_rate": 2.422119995827139e-06, + "loss": 0.79614377, + "num_input_tokens_seen": 80478490, + "step": 3737, + "time_per_iteration": 2.7072999477386475 + }, + { + "auxiliary_loss_clip": 0.01121866, + "auxiliary_loss_mlp": 0.01085544, + "balance_loss_clip": 1.03347611, + "balance_loss_mlp": 1.00476766, + "epoch": 0.44946792520892204, + "flos": 15815131827840.0, + "grad_norm": 2.7650028457579743, + "language_loss": 0.73816764, + "learning_rate": 2.4213585399914528e-06, + "loss": 0.76024175, + "num_input_tokens_seen": 80495695, + "step": 3738, + "time_per_iteration": 2.7160379886627197 + }, + { + "auxiliary_loss_clip": 0.01134249, + "auxiliary_loss_mlp": 0.01084809, + "balance_loss_clip": 1.03108263, + "balance_loss_mlp": 1.00431895, + "epoch": 0.4495881680995611, + "flos": 19610063631360.0, + "grad_norm": 1.6136557443877717, + "language_loss": 0.85438812, + "learning_rate": 2.4205970202360113e-06, + "loss": 0.87657869, + "num_input_tokens_seen": 80515260, + "step": 3739, + "time_per_iteration": 2.677361011505127 + }, + { + "auxiliary_loss_clip": 0.01106498, + "auxiliary_loss_mlp": 0.01084862, + "balance_loss_clip": 1.02910495, + "balance_loss_mlp": 1.00422871, + "epoch": 0.4497084109902002, + "flos": 26031465815040.0, + "grad_norm": 1.7865290679785029, + "language_loss": 0.78173804, + "learning_rate": 2.4198354366763354e-06, + "loss": 0.80365163, + "num_input_tokens_seen": 80533900, + "step": 3740, + "time_per_iteration": 2.894467353820801 + }, + { + "auxiliary_loss_clip": 0.01124741, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_clip": 1.02994084, + "balance_loss_mlp": 1.00390875, + "epoch": 0.4498286538808393, + "flos": 14793688771200.0, + "grad_norm": 1.9215055609869016, + "language_loss": 0.78540242, + "learning_rate": 2.4190737894279587e-06, + "loss": 0.80749571, + "num_input_tokens_seen": 80551270, + "step": 3741, + "time_per_iteration": 2.7381129264831543 + }, + { + "auxiliary_loss_clip": 0.01119566, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_clip": 1.03098035, + "balance_loss_mlp": 1.00382805, + "epoch": 0.44994889677147837, + "flos": 15450171690240.0, + "grad_norm": 2.320327415208935, + "language_loss": 0.80083185, + "learning_rate": 2.4183120786064203e-06, + "loss": 0.82287115, + "num_input_tokens_seen": 80568145, + "step": 3742, + "time_per_iteration": 2.832321882247925 + }, + { + "auxiliary_loss_clip": 0.01134591, + "auxiliary_loss_mlp": 0.00873102, + "balance_loss_clip": 1.0313096, + "balance_loss_mlp": 1.00027394, + "epoch": 0.4500691396621175, + "flos": 21798316085760.0, + "grad_norm": 3.654159622243258, + "language_loss": 0.85707963, + "learning_rate": 2.417550304327273e-06, + "loss": 0.8771565, + "num_input_tokens_seen": 80586185, + "step": 3743, + "time_per_iteration": 2.747884750366211 + }, + { + "auxiliary_loss_clip": 0.0114508, + "auxiliary_loss_mlp": 0.01086526, + "balance_loss_clip": 1.0321734, + "balance_loss_mlp": 1.00570178, + "epoch": 0.4501893825527566, + "flos": 32382016421760.0, + "grad_norm": 1.5298953326859417, + "language_loss": 0.75980884, + "learning_rate": 2.4167884667060763e-06, + "loss": 0.78212488, + "num_input_tokens_seen": 80608895, + "step": 3744, + "time_per_iteration": 2.8668177127838135 + }, + { + "auxiliary_loss_clip": 0.01128051, + "auxiliary_loss_mlp": 0.01087018, + "balance_loss_clip": 1.030828, + "balance_loss_mlp": 1.00633717, + "epoch": 0.45030962544339564, + "flos": 16544944362240.0, + "grad_norm": 2.4152089959950485, + "language_loss": 0.86919796, + "learning_rate": 2.4160265658584e-06, + "loss": 0.89134866, + "num_input_tokens_seen": 80623785, + "step": 3745, + "time_per_iteration": 2.8077378273010254 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01084584, + "balance_loss_clip": 1.0325911, + "balance_loss_mlp": 1.00395072, + "epoch": 0.45042986833403476, + "flos": 19573039687680.0, + "grad_norm": 2.028137062204733, + "language_loss": 0.6812433, + "learning_rate": 2.4152646018998253e-06, + "loss": 0.70345449, + "num_input_tokens_seen": 80642735, + "step": 3746, + "time_per_iteration": 2.737630605697632 + }, + { + "auxiliary_loss_clip": 0.01133521, + "auxiliary_loss_mlp": 0.0108589, + "balance_loss_clip": 1.03101492, + "balance_loss_mlp": 1.00520909, + "epoch": 0.45055011122467387, + "flos": 23112467072640.0, + "grad_norm": 1.6529530614816195, + "language_loss": 0.71811485, + "learning_rate": 2.4145025749459403e-06, + "loss": 0.740309, + "num_input_tokens_seen": 80663760, + "step": 3747, + "time_per_iteration": 2.767508029937744 + }, + { + "auxiliary_loss_clip": 0.01078793, + "auxiliary_loss_mlp": 0.01085637, + "balance_loss_clip": 1.02231395, + "balance_loss_mlp": 1.00486124, + "epoch": 0.4506703541153129, + "flos": 19934623946880.0, + "grad_norm": 1.783180362901887, + "language_loss": 0.69893909, + "learning_rate": 2.413740485112344e-06, + "loss": 0.72058332, + "num_input_tokens_seen": 80682100, + "step": 3748, + "time_per_iteration": 3.8841090202331543 + }, + { + "auxiliary_loss_clip": 0.01120131, + "auxiliary_loss_mlp": 0.01084145, + "balance_loss_clip": 1.02778363, + "balance_loss_mlp": 1.0037024, + "epoch": 0.45079059700595203, + "flos": 19499530504320.0, + "grad_norm": 1.6043831052235964, + "language_loss": 0.82189715, + "learning_rate": 2.412978332514646e-06, + "loss": 0.8439399, + "num_input_tokens_seen": 80700880, + "step": 3749, + "time_per_iteration": 2.7886664867401123 + }, + { + "auxiliary_loss_clip": 0.01124068, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_clip": 1.02997208, + "balance_loss_mlp": 1.00472629, + "epoch": 0.4509108398965911, + "flos": 27636313570560.0, + "grad_norm": 2.1627696238574594, + "language_loss": 0.72176766, + "learning_rate": 2.4122161172684623e-06, + "loss": 0.74386287, + "num_input_tokens_seen": 80721675, + "step": 3750, + "time_per_iteration": 2.878767490386963 + }, + { + "auxiliary_loss_clip": 0.01109266, + "auxiliary_loss_mlp": 0.01084796, + "balance_loss_clip": 1.02957392, + "balance_loss_mlp": 1.00425816, + "epoch": 0.4510310827872302, + "flos": 20995712640000.0, + "grad_norm": 2.158956751777378, + "language_loss": 0.84183693, + "learning_rate": 2.4114538394894216e-06, + "loss": 0.86377764, + "num_input_tokens_seen": 80739315, + "step": 3751, + "time_per_iteration": 2.7111635208129883 + }, + { + "auxiliary_loss_clip": 0.01127536, + "auxiliary_loss_mlp": 0.0108565, + "balance_loss_clip": 1.0310514, + "balance_loss_mlp": 1.00511229, + "epoch": 0.4511513256778693, + "flos": 16216684945920.0, + "grad_norm": 1.7410818963668133, + "language_loss": 0.83167571, + "learning_rate": 2.410691499293161e-06, + "loss": 0.85380757, + "num_input_tokens_seen": 80757470, + "step": 3752, + "time_per_iteration": 2.7254014015197754 + }, + { + "auxiliary_loss_clip": 0.01134467, + "auxiliary_loss_mlp": 0.0108568, + "balance_loss_clip": 1.03084803, + "balance_loss_mlp": 1.00495124, + "epoch": 0.45127156856850836, + "flos": 25186702780800.0, + "grad_norm": 1.574574329241052, + "language_loss": 0.74144125, + "learning_rate": 2.409929096795326e-06, + "loss": 0.76364267, + "num_input_tokens_seen": 80777840, + "step": 3753, + "time_per_iteration": 2.7340996265411377 + }, + { + "auxiliary_loss_clip": 0.01135566, + "auxiliary_loss_mlp": 0.01085235, + "balance_loss_clip": 1.0309844, + "balance_loss_mlp": 1.00450635, + "epoch": 0.4513918114591475, + "flos": 20412523422720.0, + "grad_norm": 1.8513205229758605, + "language_loss": 0.79021597, + "learning_rate": 2.409166632111573e-06, + "loss": 0.81242394, + "num_input_tokens_seen": 80795975, + "step": 3754, + "time_per_iteration": 3.608224868774414 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01084973, + "balance_loss_clip": 1.03222227, + "balance_loss_mlp": 1.00414896, + "epoch": 0.4515120543497866, + "flos": 26648482665600.0, + "grad_norm": 1.8781766486743106, + "language_loss": 0.80175281, + "learning_rate": 2.4084041053575674e-06, + "loss": 0.8239789, + "num_input_tokens_seen": 80815395, + "step": 3755, + "time_per_iteration": 2.8262388706207275 + }, + { + "auxiliary_loss_clip": 0.01111321, + "auxiliary_loss_mlp": 0.01085033, + "balance_loss_clip": 1.03183281, + "balance_loss_mlp": 1.00430465, + "epoch": 0.45163229724042564, + "flos": 20595093275520.0, + "grad_norm": 2.0293571581597396, + "language_loss": 0.72181726, + "learning_rate": 2.4076415166489834e-06, + "loss": 0.74378073, + "num_input_tokens_seen": 80834805, + "step": 3756, + "time_per_iteration": 2.8178725242614746 + }, + { + "auxiliary_loss_clip": 0.01090298, + "auxiliary_loss_mlp": 0.01085772, + "balance_loss_clip": 1.02772367, + "balance_loss_mlp": 1.00504351, + "epoch": 0.45175254013106475, + "flos": 21689004021120.0, + "grad_norm": 1.5835974183133055, + "language_loss": 0.78964663, + "learning_rate": 2.406878866101506e-06, + "loss": 0.81140721, + "num_input_tokens_seen": 80853770, + "step": 3757, + "time_per_iteration": 3.800278902053833 + }, + { + "auxiliary_loss_clip": 0.01145437, + "auxiliary_loss_mlp": 0.01085604, + "balance_loss_clip": 1.0333035, + "balance_loss_mlp": 1.00497067, + "epoch": 0.45187278302170386, + "flos": 18878850466560.0, + "grad_norm": 2.1001291067581676, + "language_loss": 0.78422612, + "learning_rate": 2.4061161538308273e-06, + "loss": 0.80653656, + "num_input_tokens_seen": 80870615, + "step": 3758, + "time_per_iteration": 2.6084752082824707 + }, + { + "auxiliary_loss_clip": 0.01134125, + "auxiliary_loss_mlp": 0.01084889, + "balance_loss_clip": 1.03094125, + "balance_loss_mlp": 1.00420833, + "epoch": 0.4519930259123429, + "flos": 18582479349120.0, + "grad_norm": 2.0840546988233943, + "language_loss": 0.88883519, + "learning_rate": 2.4053533799526523e-06, + "loss": 0.91102535, + "num_input_tokens_seen": 80886335, + "step": 3759, + "time_per_iteration": 3.5563721656799316 + }, + { + "auxiliary_loss_clip": 0.01124372, + "auxiliary_loss_mlp": 0.01083978, + "balance_loss_clip": 1.03115034, + "balance_loss_mlp": 1.0034399, + "epoch": 0.452113268802982, + "flos": 25192377129600.0, + "grad_norm": 1.7529340791174688, + "language_loss": 0.86561096, + "learning_rate": 2.404590544582691e-06, + "loss": 0.88769436, + "num_input_tokens_seen": 80904570, + "step": 3760, + "time_per_iteration": 2.880415678024292 + }, + { + "auxiliary_loss_clip": 0.01110673, + "auxiliary_loss_mlp": 0.01085352, + "balance_loss_clip": 1.03059661, + "balance_loss_mlp": 1.00467134, + "epoch": 0.45223351169362114, + "flos": 39378922312320.0, + "grad_norm": 1.6006700172962047, + "language_loss": 0.80906117, + "learning_rate": 2.403827647836666e-06, + "loss": 0.83102143, + "num_input_tokens_seen": 80925125, + "step": 3761, + "time_per_iteration": 2.92777943611145 + }, + { + "auxiliary_loss_clip": 0.01143857, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_clip": 1.03097892, + "balance_loss_mlp": 1.00446022, + "epoch": 0.4523537545842602, + "flos": 21582169994880.0, + "grad_norm": 1.833092704184352, + "language_loss": 0.69799966, + "learning_rate": 2.4030646898303075e-06, + "loss": 0.72028959, + "num_input_tokens_seen": 80946615, + "step": 3762, + "time_per_iteration": 2.7059757709503174 + }, + { + "auxiliary_loss_clip": 0.01126843, + "auxiliary_loss_mlp": 0.01085363, + "balance_loss_clip": 1.03092134, + "balance_loss_mlp": 1.00482559, + "epoch": 0.4524739974748993, + "flos": 28439527547520.0, + "grad_norm": 2.0897287093798282, + "language_loss": 0.81930727, + "learning_rate": 2.4023016706793566e-06, + "loss": 0.84142929, + "num_input_tokens_seen": 80966410, + "step": 3763, + "time_per_iteration": 2.764153480529785 + }, + { + "auxiliary_loss_clip": 0.01114258, + "auxiliary_loss_mlp": 0.0107975, + "balance_loss_clip": 1.04067612, + "balance_loss_mlp": 1.0004046, + "epoch": 0.4525942403655384, + "flos": 61556492148480.0, + "grad_norm": 6.047521015148458, + "language_loss": 0.56848919, + "learning_rate": 2.401538590499561e-06, + "loss": 0.59042919, + "num_input_tokens_seen": 81026865, + "step": 3764, + "time_per_iteration": 3.358473062515259 + }, + { + "auxiliary_loss_clip": 0.01133812, + "auxiliary_loss_mlp": 0.0087308, + "balance_loss_clip": 1.03003871, + "balance_loss_mlp": 1.00022197, + "epoch": 0.45271448325617747, + "flos": 27529838680320.0, + "grad_norm": 1.9012393511267949, + "language_loss": 0.72327209, + "learning_rate": 2.400775449406682e-06, + "loss": 0.74334103, + "num_input_tokens_seen": 81050060, + "step": 3765, + "time_per_iteration": 2.821119785308838 + }, + { + "auxiliary_loss_clip": 0.01135405, + "auxiliary_loss_mlp": 0.01085511, + "balance_loss_clip": 1.03085971, + "balance_loss_mlp": 1.00483012, + "epoch": 0.4528347261468166, + "flos": 22452608275200.0, + "grad_norm": 1.7969114433295736, + "language_loss": 0.72589236, + "learning_rate": 2.400012247516485e-06, + "loss": 0.74810159, + "num_input_tokens_seen": 81070625, + "step": 3766, + "time_per_iteration": 2.7433524131774902 + }, + { + "auxiliary_loss_clip": 0.01115469, + "auxiliary_loss_mlp": 0.01085392, + "balance_loss_clip": 1.02945614, + "balance_loss_mlp": 1.00471103, + "epoch": 0.45295496903745563, + "flos": 21103875469440.0, + "grad_norm": 1.696403300061868, + "language_loss": 0.90376848, + "learning_rate": 2.3992489849447484e-06, + "loss": 0.92577708, + "num_input_tokens_seen": 81089080, + "step": 3767, + "time_per_iteration": 2.859679698944092 + }, + { + "auxiliary_loss_clip": 0.0110053, + "auxiliary_loss_mlp": 0.01087266, + "balance_loss_clip": 1.02876747, + "balance_loss_mlp": 1.00663257, + "epoch": 0.45307521192809475, + "flos": 23221168606080.0, + "grad_norm": 1.5348903562330611, + "language_loss": 0.79136813, + "learning_rate": 2.3984856618072584e-06, + "loss": 0.81324601, + "num_input_tokens_seen": 81109115, + "step": 3768, + "time_per_iteration": 2.7669808864593506 + }, + { + "auxiliary_loss_clip": 0.01119019, + "auxiliary_loss_mlp": 0.01084744, + "balance_loss_clip": 1.03111339, + "balance_loss_mlp": 1.00411105, + "epoch": 0.45319545481873386, + "flos": 15560094286080.0, + "grad_norm": 2.1981976877116955, + "language_loss": 0.73415875, + "learning_rate": 2.3977222782198098e-06, + "loss": 0.75619644, + "num_input_tokens_seen": 81127750, + "step": 3769, + "time_per_iteration": 2.8317761421203613 + }, + { + "auxiliary_loss_clip": 0.01111714, + "auxiliary_loss_mlp": 0.01085166, + "balance_loss_clip": 1.02651703, + "balance_loss_mlp": 1.00443721, + "epoch": 0.4533156977093729, + "flos": 21944759834880.0, + "grad_norm": 1.6290269747481763, + "language_loss": 0.75376964, + "learning_rate": 2.3969588342982077e-06, + "loss": 0.77573842, + "num_input_tokens_seen": 81147125, + "step": 3770, + "time_per_iteration": 2.8700766563415527 + }, + { + "auxiliary_loss_clip": 0.01135106, + "auxiliary_loss_mlp": 0.01084528, + "balance_loss_clip": 1.03221107, + "balance_loss_mlp": 1.00384724, + "epoch": 0.453435940600012, + "flos": 24242180699520.0, + "grad_norm": 1.473185115724376, + "language_loss": 0.72456312, + "learning_rate": 2.396195330158267e-06, + "loss": 0.74675947, + "num_input_tokens_seen": 81167015, + "step": 3771, + "time_per_iteration": 2.831754446029663 + }, + { + "auxiliary_loss_clip": 0.01145051, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.03207076, + "balance_loss_mlp": 1.0060488, + "epoch": 0.45355618349065113, + "flos": 23440367352960.0, + "grad_norm": 1.7557089295111294, + "language_loss": 0.79372728, + "learning_rate": 2.3954317659158094e-06, + "loss": 0.81604654, + "num_input_tokens_seen": 81187350, + "step": 3772, + "time_per_iteration": 2.756652593612671 + }, + { + "auxiliary_loss_clip": 0.01136204, + "auxiliary_loss_mlp": 0.01079461, + "balance_loss_clip": 1.03903532, + "balance_loss_mlp": 1.00011563, + "epoch": 0.4536764263812902, + "flos": 66903161448960.0, + "grad_norm": 0.895955853180478, + "language_loss": 0.56944293, + "learning_rate": 2.394668141686667e-06, + "loss": 0.59159958, + "num_input_tokens_seen": 81249315, + "step": 3773, + "time_per_iteration": 3.260545492172241 + }, + { + "auxiliary_loss_clip": 0.01135451, + "auxiliary_loss_mlp": 0.01084949, + "balance_loss_clip": 1.03097725, + "balance_loss_mlp": 1.00431597, + "epoch": 0.4537966692719293, + "flos": 42739766254080.0, + "grad_norm": 1.850812249368942, + "language_loss": 0.69467795, + "learning_rate": 2.3939044575866813e-06, + "loss": 0.71688193, + "num_input_tokens_seen": 81272065, + "step": 3774, + "time_per_iteration": 3.8221096992492676 + }, + { + "auxiliary_loss_clip": 0.01126776, + "auxiliary_loss_mlp": 0.00873204, + "balance_loss_clip": 1.03047919, + "balance_loss_mlp": 1.00033665, + "epoch": 0.4539169121625684, + "flos": 35549480517120.0, + "grad_norm": 2.1424319089506114, + "language_loss": 0.75354815, + "learning_rate": 2.3931407137317024e-06, + "loss": 0.77354795, + "num_input_tokens_seen": 81292220, + "step": 3775, + "time_per_iteration": 2.9045891761779785 + }, + { + "auxiliary_loss_clip": 0.01102817, + "auxiliary_loss_mlp": 0.01085908, + "balance_loss_clip": 1.03019571, + "balance_loss_mlp": 1.00517917, + "epoch": 0.45403715505320746, + "flos": 18514716341760.0, + "grad_norm": 1.6397642903445875, + "language_loss": 0.84772539, + "learning_rate": 2.3923769102375907e-06, + "loss": 0.86961257, + "num_input_tokens_seen": 81311085, + "step": 3776, + "time_per_iteration": 2.782961845397949 + }, + { + "auxiliary_loss_clip": 0.0111213, + "auxiliary_loss_mlp": 0.01085933, + "balance_loss_clip": 1.02651715, + "balance_loss_mlp": 1.00529945, + "epoch": 0.4541573979438466, + "flos": 25045825639680.0, + "grad_norm": 1.9930205191299983, + "language_loss": 0.78956443, + "learning_rate": 2.391613047220213e-06, + "loss": 0.81154513, + "num_input_tokens_seen": 81330985, + "step": 3777, + "time_per_iteration": 2.7883105278015137 + }, + { + "auxiliary_loss_clip": 0.01092091, + "auxiliary_loss_mlp": 0.01085255, + "balance_loss_clip": 1.02918315, + "balance_loss_mlp": 1.00457454, + "epoch": 0.4542776408344857, + "flos": 18332397884160.0, + "grad_norm": 1.9742095961618888, + "language_loss": 0.78737295, + "learning_rate": 2.390849124795447e-06, + "loss": 0.8091464, + "num_input_tokens_seen": 81346985, + "step": 3778, + "time_per_iteration": 2.753056287765503 + }, + { + "auxiliary_loss_clip": 0.01144876, + "auxiliary_loss_mlp": 0.01085197, + "balance_loss_clip": 1.03198266, + "balance_loss_mlp": 1.0045166, + "epoch": 0.45439788372512474, + "flos": 20701173116160.0, + "grad_norm": 2.2178222578070836, + "language_loss": 0.84203398, + "learning_rate": 2.3900851430791804e-06, + "loss": 0.86433476, + "num_input_tokens_seen": 81365005, + "step": 3779, + "time_per_iteration": 2.6703147888183594 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01086415, + "balance_loss_clip": 1.03219116, + "balance_loss_mlp": 1.00549543, + "epoch": 0.45451812661576385, + "flos": 22309432663680.0, + "grad_norm": 2.149775817148954, + "language_loss": 0.84961927, + "learning_rate": 2.389321102187307e-06, + "loss": 0.87193555, + "num_input_tokens_seen": 81383785, + "step": 3780, + "time_per_iteration": 3.591179609298706 + }, + { + "auxiliary_loss_clip": 0.01121542, + "auxiliary_loss_mlp": 0.00873263, + "balance_loss_clip": 1.02788973, + "balance_loss_mlp": 1.00021791, + "epoch": 0.4546383695064029, + "flos": 21763303303680.0, + "grad_norm": 2.782892034479486, + "language_loss": 0.81875873, + "learning_rate": 2.3885570022357326e-06, + "loss": 0.83870673, + "num_input_tokens_seen": 81402915, + "step": 3781, + "time_per_iteration": 2.7716104984283447 + }, + { + "auxiliary_loss_clip": 0.01095497, + "auxiliary_loss_mlp": 0.01079543, + "balance_loss_clip": 1.03865552, + "balance_loss_mlp": 1.00019777, + "epoch": 0.454758612397042, + "flos": 64242755694720.0, + "grad_norm": 0.8138398475573376, + "language_loss": 0.60893601, + "learning_rate": 2.38779284334037e-06, + "loss": 0.6306864, + "num_input_tokens_seen": 81467890, + "step": 3782, + "time_per_iteration": 4.329915285110474 + }, + { + "auxiliary_loss_clip": 0.01106201, + "auxiliary_loss_mlp": 0.01085245, + "balance_loss_clip": 1.02757573, + "balance_loss_mlp": 1.00451636, + "epoch": 0.4548788552876811, + "flos": 27304175485440.0, + "grad_norm": 1.7928833164857634, + "language_loss": 0.79079413, + "learning_rate": 2.387028625617141e-06, + "loss": 0.81270856, + "num_input_tokens_seen": 81487105, + "step": 3783, + "time_per_iteration": 2.8615474700927734 + }, + { + "auxiliary_loss_clip": 0.01125726, + "auxiliary_loss_mlp": 0.01084643, + "balance_loss_clip": 1.03054619, + "balance_loss_mlp": 1.00401008, + "epoch": 0.4549990981783202, + "flos": 22857142222080.0, + "grad_norm": 2.5310768368469057, + "language_loss": 0.84760755, + "learning_rate": 2.3862643491819766e-06, + "loss": 0.86971128, + "num_input_tokens_seen": 81505670, + "step": 3784, + "time_per_iteration": 3.716529607772827 + }, + { + "auxiliary_loss_clip": 0.01135798, + "auxiliary_loss_mlp": 0.01084792, + "balance_loss_clip": 1.03169477, + "balance_loss_mlp": 1.00420666, + "epoch": 0.4551193410689593, + "flos": 23258587599360.0, + "grad_norm": 1.7318283694114454, + "language_loss": 0.84257102, + "learning_rate": 2.3855000141508186e-06, + "loss": 0.86477685, + "num_input_tokens_seen": 81525825, + "step": 3785, + "time_per_iteration": 2.708555221557617 + }, + { + "auxiliary_loss_clip": 0.01120571, + "auxiliary_loss_mlp": 0.01085884, + "balance_loss_clip": 1.02742863, + "balance_loss_mlp": 1.00525057, + "epoch": 0.4552395839595984, + "flos": 20777519473920.0, + "grad_norm": 1.9646380887408488, + "language_loss": 0.8373394, + "learning_rate": 2.3847356206396143e-06, + "loss": 0.85940397, + "num_input_tokens_seen": 81543135, + "step": 3786, + "time_per_iteration": 2.7397987842559814 + }, + { + "auxiliary_loss_clip": 0.01144742, + "auxiliary_loss_mlp": 0.01085013, + "balance_loss_clip": 1.03224027, + "balance_loss_mlp": 1.00437939, + "epoch": 0.45535982685023746, + "flos": 23257510191360.0, + "grad_norm": 1.4502809697329018, + "language_loss": 0.78536475, + "learning_rate": 2.3839711687643227e-06, + "loss": 0.80766231, + "num_input_tokens_seen": 81564360, + "step": 3787, + "time_per_iteration": 2.679307222366333 + }, + { + "auxiliary_loss_clip": 0.01133484, + "auxiliary_loss_mlp": 0.01084218, + "balance_loss_clip": 1.03002977, + "balance_loss_mlp": 1.00344157, + "epoch": 0.45548006974087657, + "flos": 19646117907840.0, + "grad_norm": 1.8527983972278999, + "language_loss": 0.73798847, + "learning_rate": 2.38320665864091e-06, + "loss": 0.76016545, + "num_input_tokens_seen": 81583710, + "step": 3788, + "time_per_iteration": 2.6843433380126953 + }, + { + "auxiliary_loss_clip": 0.01096051, + "auxiliary_loss_mlp": 0.01084618, + "balance_loss_clip": 1.02649188, + "balance_loss_mlp": 1.0038898, + "epoch": 0.4556003126315157, + "flos": 20047778766720.0, + "grad_norm": 1.744427793147658, + "language_loss": 0.81979597, + "learning_rate": 2.3824420903853516e-06, + "loss": 0.84160268, + "num_input_tokens_seen": 81602175, + "step": 3789, + "time_per_iteration": 2.873150587081909 + }, + { + "auxiliary_loss_clip": 0.01133487, + "auxiliary_loss_mlp": 0.01086625, + "balance_loss_clip": 1.03090382, + "balance_loss_mlp": 1.00589609, + "epoch": 0.45572055552215474, + "flos": 22959738443520.0, + "grad_norm": 2.1434553280543085, + "language_loss": 0.82407719, + "learning_rate": 2.3816774641136324e-06, + "loss": 0.84627831, + "num_input_tokens_seen": 81619430, + "step": 3790, + "time_per_iteration": 2.649934768676758 + }, + { + "auxiliary_loss_clip": 0.01133951, + "auxiliary_loss_mlp": 0.00873133, + "balance_loss_clip": 1.03055811, + "balance_loss_mlp": 1.00019193, + "epoch": 0.45584079841279385, + "flos": 33109925535360.0, + "grad_norm": 1.7381103428356985, + "language_loss": 0.71347219, + "learning_rate": 2.380912779941745e-06, + "loss": 0.73354304, + "num_input_tokens_seen": 81642550, + "step": 3791, + "time_per_iteration": 2.7716610431671143 + }, + { + "auxiliary_loss_clip": 0.01136262, + "auxiliary_loss_mlp": 0.01087135, + "balance_loss_clip": 1.03110933, + "balance_loss_mlp": 1.00626278, + "epoch": 0.45596104130343296, + "flos": 27272179445760.0, + "grad_norm": 7.302834116547749, + "language_loss": 0.83067143, + "learning_rate": 2.3801480379856918e-06, + "loss": 0.85290533, + "num_input_tokens_seen": 81664260, + "step": 3792, + "time_per_iteration": 2.837294578552246 + }, + { + "auxiliary_loss_clip": 0.01123741, + "auxiliary_loss_mlp": 0.01085932, + "balance_loss_clip": 1.02918875, + "balance_loss_mlp": 1.00534701, + "epoch": 0.456081284194072, + "flos": 21579799697280.0, + "grad_norm": 1.6719112001963683, + "language_loss": 0.83876985, + "learning_rate": 2.379383238361484e-06, + "loss": 0.86086661, + "num_input_tokens_seen": 81683620, + "step": 3793, + "time_per_iteration": 2.684326648712158 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.01085085, + "balance_loss_clip": 1.03045511, + "balance_loss_mlp": 1.00449944, + "epoch": 0.4562015270847111, + "flos": 35918822113920.0, + "grad_norm": 1.8748477759800715, + "language_loss": 0.79298842, + "learning_rate": 2.3786183811851407e-06, + "loss": 0.81517375, + "num_input_tokens_seen": 81704325, + "step": 3794, + "time_per_iteration": 2.8015987873077393 + }, + { + "auxiliary_loss_clip": 0.01144353, + "auxiliary_loss_mlp": 0.01085574, + "balance_loss_clip": 1.03193867, + "balance_loss_mlp": 1.00489306, + "epoch": 0.45632176997535023, + "flos": 13589783602560.0, + "grad_norm": 1.7492663477862187, + "language_loss": 0.79994243, + "learning_rate": 2.3778534665726892e-06, + "loss": 0.82224172, + "num_input_tokens_seen": 81721155, + "step": 3795, + "time_per_iteration": 2.6154322624206543 + }, + { + "auxiliary_loss_clip": 0.01136889, + "auxiliary_loss_mlp": 0.01084299, + "balance_loss_clip": 1.0326997, + "balance_loss_mlp": 1.00380909, + "epoch": 0.4564420128659893, + "flos": 32635401937920.0, + "grad_norm": 1.9432429110337144, + "language_loss": 0.72307926, + "learning_rate": 2.377088494640168e-06, + "loss": 0.74529111, + "num_input_tokens_seen": 81742905, + "step": 3796, + "time_per_iteration": 2.8279294967651367 + }, + { + "auxiliary_loss_clip": 0.01126411, + "auxiliary_loss_mlp": 0.01085011, + "balance_loss_clip": 1.02907205, + "balance_loss_mlp": 1.00442564, + "epoch": 0.4565622557566284, + "flos": 20377690208640.0, + "grad_norm": 1.8040282153162783, + "language_loss": 0.7798866, + "learning_rate": 2.3763234655036216e-06, + "loss": 0.80200088, + "num_input_tokens_seen": 81762105, + "step": 3797, + "time_per_iteration": 2.7361342906951904 + }, + { + "auxiliary_loss_clip": 0.01119446, + "auxiliary_loss_mlp": 0.0108569, + "balance_loss_clip": 1.03018558, + "balance_loss_mlp": 1.00515211, + "epoch": 0.45668249864726745, + "flos": 25374372364800.0, + "grad_norm": 6.48861441793048, + "language_loss": 0.87538946, + "learning_rate": 2.3755583792791046e-06, + "loss": 0.89744079, + "num_input_tokens_seen": 81781975, + "step": 3798, + "time_per_iteration": 2.794726610183716 + }, + { + "auxiliary_loss_clip": 0.01133956, + "auxiliary_loss_mlp": 0.0108564, + "balance_loss_clip": 1.0303899, + "balance_loss_mlp": 1.0048641, + "epoch": 0.45680274153790656, + "flos": 15559806977280.0, + "grad_norm": 2.04296480750872, + "language_loss": 0.74636877, + "learning_rate": 2.3747932360826803e-06, + "loss": 0.7685647, + "num_input_tokens_seen": 81798905, + "step": 3799, + "time_per_iteration": 3.6024863719940186 + }, + { + "auxiliary_loss_clip": 0.0113432, + "auxiliary_loss_mlp": 0.01085782, + "balance_loss_clip": 1.03072071, + "balance_loss_mlp": 1.00500596, + "epoch": 0.4569229844285457, + "flos": 19792884879360.0, + "grad_norm": 2.279542056716531, + "language_loss": 0.82262975, + "learning_rate": 2.3740280360304205e-06, + "loss": 0.84483075, + "num_input_tokens_seen": 81816630, + "step": 3800, + "time_per_iteration": 2.6265528202056885 + }, + { + "auxiliary_loss_clip": 0.01115303, + "auxiliary_loss_mlp": 0.01084763, + "balance_loss_clip": 1.03044176, + "balance_loss_mlp": 1.0042733, + "epoch": 0.45704322731918473, + "flos": 24093941270400.0, + "grad_norm": 1.8229153897025263, + "language_loss": 0.67705142, + "learning_rate": 2.3732627792384038e-06, + "loss": 0.6990521, + "num_input_tokens_seen": 81837700, + "step": 3801, + "time_per_iteration": 2.8115246295928955 + }, + { + "auxiliary_loss_clip": 0.0114568, + "auxiliary_loss_mlp": 0.01084527, + "balance_loss_clip": 1.03264797, + "balance_loss_mlp": 1.00384641, + "epoch": 0.45716347020982384, + "flos": 31317803245440.0, + "grad_norm": 1.816197810609129, + "language_loss": 0.75578004, + "learning_rate": 2.3724974658227207e-06, + "loss": 0.77808207, + "num_input_tokens_seen": 81858490, + "step": 3802, + "time_per_iteration": 2.7535812854766846 + }, + { + "auxiliary_loss_clip": 0.01122784, + "auxiliary_loss_mlp": 0.0087319, + "balance_loss_clip": 1.02906513, + "balance_loss_mlp": 1.00016952, + "epoch": 0.45728371310046295, + "flos": 26501392471680.0, + "grad_norm": 1.844062989804071, + "language_loss": 0.7118513, + "learning_rate": 2.3717320958994687e-06, + "loss": 0.73181105, + "num_input_tokens_seen": 81876050, + "step": 3803, + "time_per_iteration": 2.8081345558166504 + }, + { + "auxiliary_loss_clip": 0.01119139, + "auxiliary_loss_mlp": 0.01085265, + "balance_loss_clip": 1.03025913, + "balance_loss_mlp": 1.00463152, + "epoch": 0.457403955991102, + "flos": 17929408222080.0, + "grad_norm": 1.9348647431883488, + "language_loss": 0.70641792, + "learning_rate": 2.3709666695847534e-06, + "loss": 0.72846198, + "num_input_tokens_seen": 81894230, + "step": 3804, + "time_per_iteration": 2.7124738693237305 + }, + { + "auxiliary_loss_clip": 0.01108811, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_clip": 1.03010464, + "balance_loss_mlp": 1.00468838, + "epoch": 0.4575241988817411, + "flos": 42230660837760.0, + "grad_norm": 1.5980371567319804, + "language_loss": 0.70081717, + "learning_rate": 2.370201186994689e-06, + "loss": 0.72275805, + "num_input_tokens_seen": 81917915, + "step": 3805, + "time_per_iteration": 3.8834328651428223 + }, + { + "auxiliary_loss_clip": 0.01126584, + "auxiliary_loss_mlp": 0.01086723, + "balance_loss_clip": 1.03181267, + "balance_loss_mlp": 1.00599444, + "epoch": 0.45764444177238023, + "flos": 30117309868800.0, + "grad_norm": 1.7670569033947108, + "language_loss": 0.69866955, + "learning_rate": 2.369435648245399e-06, + "loss": 0.72080266, + "num_input_tokens_seen": 81938130, + "step": 3806, + "time_per_iteration": 2.8114566802978516 + }, + { + "auxiliary_loss_clip": 0.01123706, + "auxiliary_loss_mlp": 0.01085548, + "balance_loss_clip": 1.02953804, + "balance_loss_mlp": 1.00481915, + "epoch": 0.4577646846630193, + "flos": 24060293205120.0, + "grad_norm": 1.6775403497098176, + "language_loss": 0.85105622, + "learning_rate": 2.368670053453015e-06, + "loss": 0.8731488, + "num_input_tokens_seen": 81959820, + "step": 3807, + "time_per_iteration": 3.7386388778686523 + }, + { + "auxiliary_loss_clip": 0.01135517, + "auxiliary_loss_mlp": 0.01086144, + "balance_loss_clip": 1.03069222, + "balance_loss_mlp": 1.0054158, + "epoch": 0.4578849275536584, + "flos": 17418578952960.0, + "grad_norm": 2.306368654133814, + "language_loss": 0.74172556, + "learning_rate": 2.3679044027336757e-06, + "loss": 0.76394212, + "num_input_tokens_seen": 81975710, + "step": 3808, + "time_per_iteration": 2.728854179382324 + }, + { + "auxiliary_loss_clip": 0.01144315, + "auxiliary_loss_mlp": 0.01086284, + "balance_loss_clip": 1.03159547, + "balance_loss_mlp": 1.00545967, + "epoch": 0.4580051704442975, + "flos": 13510169107200.0, + "grad_norm": 3.8818439343388236, + "language_loss": 0.69372153, + "learning_rate": 2.3671386962035326e-06, + "loss": 0.7160275, + "num_input_tokens_seen": 81993180, + "step": 3809, + "time_per_iteration": 2.640852689743042 + }, + { + "auxiliary_loss_clip": 0.01133787, + "auxiliary_loss_mlp": 0.01085657, + "balance_loss_clip": 1.03042614, + "balance_loss_mlp": 1.00483298, + "epoch": 0.45812541333493656, + "flos": 18037606965120.0, + "grad_norm": 1.988644600353848, + "language_loss": 0.68875182, + "learning_rate": 2.3663729339787405e-06, + "loss": 0.71094632, + "num_input_tokens_seen": 82010115, + "step": 3810, + "time_per_iteration": 3.619274854660034 + }, + { + "auxiliary_loss_clip": 0.01143624, + "auxiliary_loss_mlp": 0.0108564, + "balance_loss_clip": 1.03152919, + "balance_loss_mlp": 1.0048641, + "epoch": 0.45824565622557567, + "flos": 20222196232320.0, + "grad_norm": 2.403023332737483, + "language_loss": 0.73631126, + "learning_rate": 2.365607116175466e-06, + "loss": 0.75860393, + "num_input_tokens_seen": 82025540, + "step": 3811, + "time_per_iteration": 2.656525135040283 + }, + { + "auxiliary_loss_clip": 0.01143947, + "auxiliary_loss_mlp": 0.01084867, + "balance_loss_clip": 1.03178227, + "balance_loss_mlp": 1.004282, + "epoch": 0.4583658991162148, + "flos": 19864885691520.0, + "grad_norm": 2.9198216333365097, + "language_loss": 0.67039186, + "learning_rate": 2.3648412429098825e-06, + "loss": 0.69268, + "num_input_tokens_seen": 82043890, + "step": 3812, + "time_per_iteration": 2.607335329055786 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.0108616, + "balance_loss_clip": 1.02947438, + "balance_loss_mlp": 1.00528824, + "epoch": 0.45848614200685384, + "flos": 21029935322880.0, + "grad_norm": 1.6828697300728566, + "language_loss": 0.82033646, + "learning_rate": 2.364075314298172e-06, + "loss": 0.84235609, + "num_input_tokens_seen": 82061345, + "step": 3813, + "time_per_iteration": 2.7802977561950684 + }, + { + "auxiliary_loss_clip": 0.01136649, + "auxiliary_loss_mlp": 0.00873191, + "balance_loss_clip": 1.03229988, + "balance_loss_mlp": 1.00019157, + "epoch": 0.45860638489749295, + "flos": 21069293650560.0, + "grad_norm": 1.7728608495650227, + "language_loss": 0.70125991, + "learning_rate": 2.3633093304565267e-06, + "loss": 0.7213583, + "num_input_tokens_seen": 82080400, + "step": 3814, + "time_per_iteration": 2.6654486656188965 + }, + { + "auxiliary_loss_clip": 0.01145746, + "auxiliary_loss_mlp": 0.01085635, + "balance_loss_clip": 1.03259671, + "balance_loss_mlp": 1.00500178, + "epoch": 0.458726627788132, + "flos": 26833889692800.0, + "grad_norm": 1.8474074687153628, + "language_loss": 0.6305964, + "learning_rate": 2.3625432915011443e-06, + "loss": 0.65291023, + "num_input_tokens_seen": 82102310, + "step": 3815, + "time_per_iteration": 2.685431718826294 + }, + { + "auxiliary_loss_clip": 0.01125162, + "auxiliary_loss_mlp": 0.01086353, + "balance_loss_clip": 1.03034782, + "balance_loss_mlp": 1.00567222, + "epoch": 0.4588468706787711, + "flos": 24097927680000.0, + "grad_norm": 1.951105704684719, + "language_loss": 0.65487564, + "learning_rate": 2.3617771975482334e-06, + "loss": 0.67699081, + "num_input_tokens_seen": 82121140, + "step": 3816, + "time_per_iteration": 2.78863525390625 + }, + { + "auxiliary_loss_clip": 0.01107687, + "auxiliary_loss_mlp": 0.01085922, + "balance_loss_clip": 1.0290314, + "balance_loss_mlp": 1.00538421, + "epoch": 0.4589671135694102, + "flos": 17889331622400.0, + "grad_norm": 1.6034901787253466, + "language_loss": 0.74587065, + "learning_rate": 2.3610110487140083e-06, + "loss": 0.76780671, + "num_input_tokens_seen": 82139575, + "step": 3817, + "time_per_iteration": 2.800201654434204 + }, + { + "auxiliary_loss_clip": 0.01121093, + "auxiliary_loss_mlp": 0.01085441, + "balance_loss_clip": 1.02781439, + "balance_loss_mlp": 1.00490332, + "epoch": 0.4590873564600493, + "flos": 25626967781760.0, + "grad_norm": 1.7394853990376284, + "language_loss": 0.80673915, + "learning_rate": 2.360244845114695e-06, + "loss": 0.82880449, + "num_input_tokens_seen": 82159195, + "step": 3818, + "time_per_iteration": 2.849888324737549 + }, + { + "auxiliary_loss_clip": 0.01118489, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_clip": 1.026667, + "balance_loss_mlp": 1.00585461, + "epoch": 0.4592075993506884, + "flos": 18514788168960.0, + "grad_norm": 2.395141803236091, + "language_loss": 0.68825901, + "learning_rate": 2.3594785868665245e-06, + "loss": 0.71030974, + "num_input_tokens_seen": 82175500, + "step": 3819, + "time_per_iteration": 2.7552542686462402 + }, + { + "auxiliary_loss_clip": 0.01114528, + "auxiliary_loss_mlp": 0.00873131, + "balance_loss_clip": 1.02866113, + "balance_loss_mlp": 1.00012803, + "epoch": 0.4593278422413275, + "flos": 20631111638400.0, + "grad_norm": 2.043904745756592, + "language_loss": 0.80883336, + "learning_rate": 2.3587122740857386e-06, + "loss": 0.82870996, + "num_input_tokens_seen": 82192600, + "step": 3820, + "time_per_iteration": 2.7951412200927734 + }, + { + "auxiliary_loss_clip": 0.01134627, + "auxiliary_loss_mlp": 0.01084867, + "balance_loss_clip": 1.03062797, + "balance_loss_mlp": 1.004282, + "epoch": 0.45944808513196655, + "flos": 21358517961600.0, + "grad_norm": 1.717160312276484, + "language_loss": 0.77965915, + "learning_rate": 2.357945906888586e-06, + "loss": 0.80185413, + "num_input_tokens_seen": 82212040, + "step": 3821, + "time_per_iteration": 2.678950786590576 + }, + { + "auxiliary_loss_clip": 0.01131965, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_clip": 1.02857721, + "balance_loss_mlp": 1.0058217, + "epoch": 0.45956832802260567, + "flos": 21427789340160.0, + "grad_norm": 2.2960155450155035, + "language_loss": 0.79507804, + "learning_rate": 2.357179485391324e-06, + "loss": 0.81726366, + "num_input_tokens_seen": 82229895, + "step": 3822, + "time_per_iteration": 2.717885971069336 + }, + { + "auxiliary_loss_clip": 0.01144532, + "auxiliary_loss_mlp": 0.01085147, + "balance_loss_clip": 1.03195846, + "balance_loss_mlp": 1.00470471, + "epoch": 0.4596885709132448, + "flos": 22382654538240.0, + "grad_norm": 1.8550790452950594, + "language_loss": 0.86527944, + "learning_rate": 2.3564130097102173e-06, + "loss": 0.88757622, + "num_input_tokens_seen": 82249550, + "step": 3823, + "time_per_iteration": 2.625875234603882 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01086538, + "balance_loss_clip": 1.02896357, + "balance_loss_mlp": 1.00595236, + "epoch": 0.45980881380388383, + "flos": 28981957806720.0, + "grad_norm": 1.5816646406331267, + "language_loss": 0.74891233, + "learning_rate": 2.355646479961541e-06, + "loss": 0.77100241, + "num_input_tokens_seen": 82268860, + "step": 3824, + "time_per_iteration": 2.81538724899292 + }, + { + "auxiliary_loss_clip": 0.0114364, + "auxiliary_loss_mlp": 0.01085627, + "balance_loss_clip": 1.03108799, + "balance_loss_mlp": 1.00499415, + "epoch": 0.45992905669452294, + "flos": 33396599980800.0, + "grad_norm": 1.9599183288016107, + "language_loss": 0.71490943, + "learning_rate": 2.354879896261576e-06, + "loss": 0.73720211, + "num_input_tokens_seen": 82289070, + "step": 3825, + "time_per_iteration": 3.7341411113739014 + }, + { + "auxiliary_loss_clip": 0.0110931, + "auxiliary_loss_mlp": 0.01086126, + "balance_loss_clip": 1.02594686, + "balance_loss_mlp": 1.00549293, + "epoch": 0.46004929958516205, + "flos": 36318184502400.0, + "grad_norm": 1.7994270155639986, + "language_loss": 0.56832457, + "learning_rate": 2.3541132587266133e-06, + "loss": 0.59027892, + "num_input_tokens_seen": 82311790, + "step": 3826, + "time_per_iteration": 2.881159543991089 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01084575, + "balance_loss_clip": 1.03019059, + "balance_loss_mlp": 1.00384617, + "epoch": 0.4601695424758011, + "flos": 17238451224960.0, + "grad_norm": 1.7344527628169528, + "language_loss": 0.68884176, + "learning_rate": 2.3533465674729515e-06, + "loss": 0.71086293, + "num_input_tokens_seen": 82329020, + "step": 3827, + "time_per_iteration": 2.8532755374908447 + }, + { + "auxiliary_loss_clip": 0.01143927, + "auxiliary_loss_mlp": 0.01086491, + "balance_loss_clip": 1.03143287, + "balance_loss_mlp": 1.00581002, + "epoch": 0.4602897853664402, + "flos": 15888425529600.0, + "grad_norm": 2.321181100338117, + "language_loss": 0.72572333, + "learning_rate": 2.352579822616895e-06, + "loss": 0.7480275, + "num_input_tokens_seen": 82346455, + "step": 3828, + "time_per_iteration": 2.6343741416931152 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01085356, + "balance_loss_clip": 1.03217983, + "balance_loss_mlp": 1.0047226, + "epoch": 0.4604100282570793, + "flos": 25412617370880.0, + "grad_norm": 1.9920853533557776, + "language_loss": 0.78003103, + "learning_rate": 2.351813024274761e-06, + "loss": 0.80199355, + "num_input_tokens_seen": 82367810, + "step": 3829, + "time_per_iteration": 2.8261470794677734 + }, + { + "auxiliary_loss_clip": 0.01117877, + "auxiliary_loss_mlp": 0.01084706, + "balance_loss_clip": 1.03087378, + "balance_loss_mlp": 1.0041678, + "epoch": 0.4605302711477184, + "flos": 27630711048960.0, + "grad_norm": 1.6759011382424698, + "language_loss": 0.73741442, + "learning_rate": 2.3510461725628693e-06, + "loss": 0.7594403, + "num_input_tokens_seen": 82388275, + "step": 3830, + "time_per_iteration": 3.784926414489746 + }, + { + "auxiliary_loss_clip": 0.01114096, + "auxiliary_loss_mlp": 0.0108638, + "balance_loss_clip": 1.02831221, + "balance_loss_mlp": 1.0057466, + "epoch": 0.4606505140383575, + "flos": 23839657914240.0, + "grad_norm": 1.9234950419390877, + "language_loss": 0.70936453, + "learning_rate": 2.350279267597554e-06, + "loss": 0.73136926, + "num_input_tokens_seen": 82408915, + "step": 3831, + "time_per_iteration": 2.7796599864959717 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.01085953, + "balance_loss_clip": 1.03275633, + "balance_loss_mlp": 1.00512886, + "epoch": 0.46077075692899655, + "flos": 16107013745280.0, + "grad_norm": 2.1103296697505516, + "language_loss": 0.82834601, + "learning_rate": 2.3495123094951515e-06, + "loss": 0.85056657, + "num_input_tokens_seen": 82427260, + "step": 3832, + "time_per_iteration": 3.6238436698913574 + }, + { + "auxiliary_loss_clip": 0.01127175, + "auxiliary_loss_mlp": 0.01084424, + "balance_loss_clip": 1.03181624, + "balance_loss_mlp": 1.00374329, + "epoch": 0.46089099981963566, + "flos": 48798147634560.0, + "grad_norm": 2.087238894232564, + "language_loss": 0.75763655, + "learning_rate": 2.34874529837201e-06, + "loss": 0.77975249, + "num_input_tokens_seen": 82450805, + "step": 3833, + "time_per_iteration": 3.03385853767395 + }, + { + "auxiliary_loss_clip": 0.01096521, + "auxiliary_loss_mlp": 0.01084755, + "balance_loss_clip": 1.02706826, + "balance_loss_mlp": 1.00412178, + "epoch": 0.46101124271027477, + "flos": 19099234362240.0, + "grad_norm": 1.8149576916465737, + "language_loss": 0.78843904, + "learning_rate": 2.347978234344483e-06, + "loss": 0.81025183, + "num_input_tokens_seen": 82467010, + "step": 3834, + "time_per_iteration": 2.809267044067383 + }, + { + "auxiliary_loss_clip": 0.01136541, + "auxiliary_loss_mlp": 0.01086484, + "balance_loss_clip": 1.03230166, + "balance_loss_mlp": 1.00566006, + "epoch": 0.4611314856009138, + "flos": 39347931853440.0, + "grad_norm": 1.9358206678002419, + "language_loss": 0.69584757, + "learning_rate": 2.347211117528935e-06, + "loss": 0.71807784, + "num_input_tokens_seen": 82489310, + "step": 3835, + "time_per_iteration": 2.8317904472351074 + }, + { + "auxiliary_loss_clip": 0.01118531, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_clip": 1.03123832, + "balance_loss_mlp": 1.00505674, + "epoch": 0.46125172849155294, + "flos": 20810772489600.0, + "grad_norm": 1.690559312571546, + "language_loss": 0.71830696, + "learning_rate": 2.3464439480417374e-06, + "loss": 0.7403487, + "num_input_tokens_seen": 82508830, + "step": 3836, + "time_per_iteration": 3.691917896270752 + }, + { + "auxiliary_loss_clip": 0.0113535, + "auxiliary_loss_mlp": 0.01085163, + "balance_loss_clip": 1.03138447, + "balance_loss_mlp": 1.00443447, + "epoch": 0.46137197138219205, + "flos": 17930808852480.0, + "grad_norm": 2.596071545755018, + "language_loss": 0.76996946, + "learning_rate": 2.3456767259992676e-06, + "loss": 0.79217458, + "num_input_tokens_seen": 82526475, + "step": 3837, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.01143316, + "auxiliary_loss_mlp": 0.00873198, + "balance_loss_clip": 1.03059256, + "balance_loss_mlp": 1.00009179, + "epoch": 0.4614922142728311, + "flos": 16836610798080.0, + "grad_norm": 1.9824174753794466, + "language_loss": 0.8902992, + "learning_rate": 2.3449094515179135e-06, + "loss": 0.91046435, + "num_input_tokens_seen": 82543935, + "step": 3838, + "time_per_iteration": 2.6496951580047607 + }, + { + "auxiliary_loss_clip": 0.01127411, + "auxiliary_loss_mlp": 0.01084842, + "balance_loss_clip": 1.03069019, + "balance_loss_mlp": 1.0043999, + "epoch": 0.4616124571634702, + "flos": 26614906427520.0, + "grad_norm": 1.5241013990090666, + "language_loss": 0.81599319, + "learning_rate": 2.34414212471407e-06, + "loss": 0.83811581, + "num_input_tokens_seen": 82563730, + "step": 3839, + "time_per_iteration": 2.789855718612671 + }, + { + "auxiliary_loss_clip": 0.01135983, + "auxiliary_loss_mlp": 0.01087057, + "balance_loss_clip": 1.03086889, + "balance_loss_mlp": 1.00632858, + "epoch": 0.4617327000541093, + "flos": 20340127560960.0, + "grad_norm": 2.087717352470033, + "language_loss": 0.73026097, + "learning_rate": 2.3433747457041394e-06, + "loss": 0.75249135, + "num_input_tokens_seen": 82582435, + "step": 3840, + "time_per_iteration": 2.7312679290771484 + }, + { + "auxiliary_loss_clip": 0.01110276, + "auxiliary_loss_mlp": 0.0108567, + "balance_loss_clip": 1.02650952, + "balance_loss_mlp": 1.00498974, + "epoch": 0.4618529429447484, + "flos": 29570749545600.0, + "grad_norm": 1.9515552908995144, + "language_loss": 0.84720236, + "learning_rate": 2.342607314604533e-06, + "loss": 0.86916178, + "num_input_tokens_seen": 82602185, + "step": 3841, + "time_per_iteration": 2.8760831356048584 + }, + { + "auxiliary_loss_clip": 0.01130349, + "auxiliary_loss_mlp": 0.01085869, + "balance_loss_clip": 1.02871192, + "balance_loss_mlp": 1.0051403, + "epoch": 0.4619731858353875, + "flos": 19787030962560.0, + "grad_norm": 1.5724215862008792, + "language_loss": 0.84128594, + "learning_rate": 2.3418398315316694e-06, + "loss": 0.86344808, + "num_input_tokens_seen": 82620005, + "step": 3842, + "time_per_iteration": 2.686746835708618 + }, + { + "auxiliary_loss_clip": 0.01145092, + "auxiliary_loss_mlp": 0.01085961, + "balance_loss_clip": 1.03263581, + "balance_loss_mlp": 1.0052321, + "epoch": 0.4620934287260266, + "flos": 18951138587520.0, + "grad_norm": 2.4536673366424937, + "language_loss": 0.78275621, + "learning_rate": 2.3410722966019755e-06, + "loss": 0.80506682, + "num_input_tokens_seen": 82635120, + "step": 3843, + "time_per_iteration": 2.5828142166137695 + }, + { + "auxiliary_loss_clip": 0.01133364, + "auxiliary_loss_mlp": 0.01083533, + "balance_loss_clip": 1.03008771, + "balance_loss_mlp": 1.00294781, + "epoch": 0.46221367161666566, + "flos": 37341674634240.0, + "grad_norm": 1.6903128758938513, + "language_loss": 0.65341401, + "learning_rate": 2.3403047099318848e-06, + "loss": 0.675583, + "num_input_tokens_seen": 82659190, + "step": 3844, + "time_per_iteration": 2.8732750415802 + }, + { + "auxiliary_loss_clip": 0.0110703, + "auxiliary_loss_mlp": 0.01085045, + "balance_loss_clip": 1.02901649, + "balance_loss_mlp": 1.00431657, + "epoch": 0.46233391450730477, + "flos": 14428549065600.0, + "grad_norm": 2.6246904535668403, + "language_loss": 0.75410199, + "learning_rate": 2.3395370716378405e-06, + "loss": 0.77602273, + "num_input_tokens_seen": 82676635, + "step": 3845, + "time_per_iteration": 2.7589128017425537 + }, + { + "auxiliary_loss_clip": 0.01135407, + "auxiliary_loss_mlp": 0.01085702, + "balance_loss_clip": 1.03076982, + "balance_loss_mlp": 1.00506914, + "epoch": 0.4624541573979438, + "flos": 22493044010880.0, + "grad_norm": 2.2353050849476377, + "language_loss": 0.7226795, + "learning_rate": 2.338769381836292e-06, + "loss": 0.74489057, + "num_input_tokens_seen": 82696245, + "step": 3846, + "time_per_iteration": 2.7320923805236816 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_clip": 1.0268476, + "balance_loss_mlp": 1.00592875, + "epoch": 0.46257440028858293, + "flos": 14465070218880.0, + "grad_norm": 1.7828931174850537, + "language_loss": 0.7330687, + "learning_rate": 2.3380016406436984e-06, + "loss": 0.75504541, + "num_input_tokens_seen": 82713725, + "step": 3847, + "time_per_iteration": 2.769620656967163 + }, + { + "auxiliary_loss_clip": 0.01101984, + "auxiliary_loss_mlp": 0.01086388, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.00565934, + "epoch": 0.46269464317922204, + "flos": 23332204523520.0, + "grad_norm": 1.808515783536446, + "language_loss": 0.81380081, + "learning_rate": 2.337233848176524e-06, + "loss": 0.83568454, + "num_input_tokens_seen": 82731495, + "step": 3848, + "time_per_iteration": 2.772723913192749 + }, + { + "auxiliary_loss_clip": 0.01105587, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_clip": 1.02831435, + "balance_loss_mlp": 1.00413966, + "epoch": 0.4628148860698611, + "flos": 18552027594240.0, + "grad_norm": 1.8633179338695711, + "language_loss": 0.83539021, + "learning_rate": 2.3364660045512435e-06, + "loss": 0.85729527, + "num_input_tokens_seen": 82750255, + "step": 3849, + "time_per_iteration": 2.801933526992798 + }, + { + "auxiliary_loss_clip": 0.01118443, + "auxiliary_loss_mlp": 0.01079588, + "balance_loss_clip": 1.0387938, + "balance_loss_mlp": 1.00024188, + "epoch": 0.4629351289605002, + "flos": 70667569670400.0, + "grad_norm": 0.7409392591603542, + "language_loss": 0.58251524, + "learning_rate": 2.335698109884337e-06, + "loss": 0.60449553, + "num_input_tokens_seen": 82815460, + "step": 3850, + "time_per_iteration": 4.325575351715088 + }, + { + "auxiliary_loss_clip": 0.01095542, + "auxiliary_loss_mlp": 0.01080554, + "balance_loss_clip": 1.03122759, + "balance_loss_mlp": 1.00120831, + "epoch": 0.4630553718511393, + "flos": 59687200465920.0, + "grad_norm": 0.7895214524054247, + "language_loss": 0.59876502, + "learning_rate": 2.334930164292294e-06, + "loss": 0.62052596, + "num_input_tokens_seen": 82878010, + "step": 3851, + "time_per_iteration": 3.476506471633911 + }, + { + "auxiliary_loss_clip": 0.01109572, + "auxiliary_loss_mlp": 0.01086544, + "balance_loss_clip": 1.03011727, + "balance_loss_mlp": 1.00591075, + "epoch": 0.4631756147417784, + "flos": 15960605909760.0, + "grad_norm": 1.9245667299043718, + "language_loss": 0.79542768, + "learning_rate": 2.334162167891612e-06, + "loss": 0.81738889, + "num_input_tokens_seen": 82895275, + "step": 3852, + "time_per_iteration": 2.9047508239746094 + }, + { + "auxiliary_loss_clip": 0.011281, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_clip": 1.03102648, + "balance_loss_mlp": 1.0051012, + "epoch": 0.4632958576324175, + "flos": 16472907636480.0, + "grad_norm": 2.4385268350151965, + "language_loss": 0.7495507, + "learning_rate": 2.333394120798795e-06, + "loss": 0.77169096, + "num_input_tokens_seen": 82914010, + "step": 3853, + "time_per_iteration": 2.737529754638672 + }, + { + "auxiliary_loss_clip": 0.01124819, + "auxiliary_loss_mlp": 0.0108566, + "balance_loss_clip": 1.02947712, + "balance_loss_mlp": 1.00493193, + "epoch": 0.4634161005230566, + "flos": 22346492520960.0, + "grad_norm": 2.1062126182628145, + "language_loss": 0.72121394, + "learning_rate": 2.3326260231303545e-06, + "loss": 0.74331868, + "num_input_tokens_seen": 82932610, + "step": 3854, + "time_per_iteration": 2.7504475116729736 + }, + { + "auxiliary_loss_clip": 0.01144112, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_clip": 1.03147972, + "balance_loss_mlp": 1.00419021, + "epoch": 0.46353634341369565, + "flos": 15742233175680.0, + "grad_norm": 1.5210482455018024, + "language_loss": 0.86546522, + "learning_rate": 2.331857875002811e-06, + "loss": 0.88775361, + "num_input_tokens_seen": 82951210, + "step": 3855, + "time_per_iteration": 3.6449036598205566 + }, + { + "auxiliary_loss_clip": 0.01125581, + "auxiliary_loss_mlp": 0.01086819, + "balance_loss_clip": 1.03111911, + "balance_loss_mlp": 1.00613785, + "epoch": 0.46365658630433476, + "flos": 28329820433280.0, + "grad_norm": 1.845444614664273, + "language_loss": 0.76139086, + "learning_rate": 2.3310896765326916e-06, + "loss": 0.78351486, + "num_input_tokens_seen": 82972210, + "step": 3856, + "time_per_iteration": 2.7407963275909424 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01087094, + "balance_loss_clip": 1.02678204, + "balance_loss_mlp": 1.0064137, + "epoch": 0.46377682919497387, + "flos": 24608074590720.0, + "grad_norm": 1.5395996176966635, + "language_loss": 0.8404398, + "learning_rate": 2.330321427836531e-06, + "loss": 0.86243486, + "num_input_tokens_seen": 82994080, + "step": 3857, + "time_per_iteration": 3.701014757156372 + }, + { + "auxiliary_loss_clip": 0.01137435, + "auxiliary_loss_mlp": 0.01085432, + "balance_loss_clip": 1.03268659, + "balance_loss_mlp": 1.00479937, + "epoch": 0.4638970720856129, + "flos": 19060953442560.0, + "grad_norm": 1.780889930648488, + "language_loss": 0.82794678, + "learning_rate": 2.3295531290308733e-06, + "loss": 0.85017544, + "num_input_tokens_seen": 83012230, + "step": 3858, + "time_per_iteration": 2.6984386444091797 + }, + { + "auxiliary_loss_clip": 0.01145796, + "auxiliary_loss_mlp": 0.0087322, + "balance_loss_clip": 1.03310323, + "balance_loss_mlp": 1.00011277, + "epoch": 0.46401731497625204, + "flos": 18471012468480.0, + "grad_norm": 3.37192647495905, + "language_loss": 0.75754094, + "learning_rate": 2.3287847802322678e-06, + "loss": 0.77773106, + "num_input_tokens_seen": 83027800, + "step": 3859, + "time_per_iteration": 2.656977891921997 + }, + { + "auxiliary_loss_clip": 0.01108126, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_clip": 1.02979398, + "balance_loss_mlp": 1.00449991, + "epoch": 0.4641375578668911, + "flos": 26067053214720.0, + "grad_norm": 1.7643441104994202, + "language_loss": 0.83769512, + "learning_rate": 2.3280163815572723e-06, + "loss": 0.85962963, + "num_input_tokens_seen": 83048395, + "step": 3860, + "time_per_iteration": 2.790156841278076 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_clip": 1.03055596, + "balance_loss_mlp": 1.00405908, + "epoch": 0.4642578007575302, + "flos": 19570382081280.0, + "grad_norm": 2.073412651677749, + "language_loss": 0.7665416, + "learning_rate": 2.3272479331224522e-06, + "loss": 0.78865361, + "num_input_tokens_seen": 83065825, + "step": 3861, + "time_per_iteration": 3.65564227104187 + }, + { + "auxiliary_loss_clip": 0.01143727, + "auxiliary_loss_mlp": 0.01086804, + "balance_loss_clip": 1.03096509, + "balance_loss_mlp": 1.00607586, + "epoch": 0.4643780436481693, + "flos": 28186249772160.0, + "grad_norm": 1.7595949555888764, + "language_loss": 0.77887976, + "learning_rate": 2.3264794350443817e-06, + "loss": 0.80118513, + "num_input_tokens_seen": 83087920, + "step": 3862, + "time_per_iteration": 2.8243489265441895 + }, + { + "auxiliary_loss_clip": 0.01134975, + "auxiliary_loss_mlp": 0.01085669, + "balance_loss_clip": 1.0304265, + "balance_loss_mlp": 1.00494039, + "epoch": 0.46449828653880837, + "flos": 25375270204800.0, + "grad_norm": 3.3324916214095466, + "language_loss": 0.78243005, + "learning_rate": 2.3257108874396396e-06, + "loss": 0.8046366, + "num_input_tokens_seen": 83109015, + "step": 3863, + "time_per_iteration": 2.720991373062134 + }, + { + "auxiliary_loss_clip": 0.01126633, + "auxiliary_loss_mlp": 0.01087061, + "balance_loss_clip": 1.03062129, + "balance_loss_mlp": 1.00637996, + "epoch": 0.4646185294294475, + "flos": 16034330574720.0, + "grad_norm": 1.7883066046024274, + "language_loss": 0.73453927, + "learning_rate": 2.3249422904248152e-06, + "loss": 0.7566762, + "num_input_tokens_seen": 83127450, + "step": 3864, + "time_per_iteration": 2.789482831954956 + }, + { + "auxiliary_loss_clip": 0.01135369, + "auxiliary_loss_mlp": 0.01085582, + "balance_loss_clip": 1.03033137, + "balance_loss_mlp": 1.00494909, + "epoch": 0.4647387723200866, + "flos": 26363101109760.0, + "grad_norm": 1.3936529033086367, + "language_loss": 0.87083161, + "learning_rate": 2.324173644116504e-06, + "loss": 0.89304113, + "num_input_tokens_seen": 83150300, + "step": 3865, + "time_per_iteration": 2.7164855003356934 + }, + { + "auxiliary_loss_clip": 0.01133263, + "auxiliary_loss_mlp": 0.01085906, + "balance_loss_clip": 1.0310533, + "balance_loss_mlp": 1.00527334, + "epoch": 0.46485901521072565, + "flos": 27160209774720.0, + "grad_norm": 1.5998406331112014, + "language_loss": 0.81317168, + "learning_rate": 2.3234049486313087e-06, + "loss": 0.83536339, + "num_input_tokens_seen": 83171750, + "step": 3866, + "time_per_iteration": 2.7307169437408447 + }, + { + "auxiliary_loss_clip": 0.01135213, + "auxiliary_loss_mlp": 0.01085314, + "balance_loss_clip": 1.03122926, + "balance_loss_mlp": 1.00491893, + "epoch": 0.46497925810136476, + "flos": 24279851088000.0, + "grad_norm": 1.8116981552016138, + "language_loss": 0.75865346, + "learning_rate": 2.322636204085839e-06, + "loss": 0.78085876, + "num_input_tokens_seen": 83191820, + "step": 3867, + "time_per_iteration": 2.719398260116577 + }, + { + "auxiliary_loss_clip": 0.01127244, + "auxiliary_loss_mlp": 0.01086395, + "balance_loss_clip": 1.03070831, + "balance_loss_mlp": 1.0057615, + "epoch": 0.46509950099200387, + "flos": 16253134272000.0, + "grad_norm": 2.2647008160426725, + "language_loss": 0.79150259, + "learning_rate": 2.3218674105967143e-06, + "loss": 0.81363893, + "num_input_tokens_seen": 83210085, + "step": 3868, + "time_per_iteration": 2.716344118118286 + }, + { + "auxiliary_loss_clip": 0.0112343, + "auxiliary_loss_mlp": 0.01086186, + "balance_loss_clip": 1.02883935, + "balance_loss_mlp": 1.0054574, + "epoch": 0.4652197438826429, + "flos": 23442270773760.0, + "grad_norm": 1.5195117619578373, + "language_loss": 0.83691239, + "learning_rate": 2.3210985682805593e-06, + "loss": 0.85900855, + "num_input_tokens_seen": 83231865, + "step": 3869, + "time_per_iteration": 2.861849069595337 + }, + { + "auxiliary_loss_clip": 0.011448, + "auxiliary_loss_mlp": 0.01085121, + "balance_loss_clip": 1.03265107, + "balance_loss_mlp": 1.00444031, + "epoch": 0.46533998677328203, + "flos": 16216397637120.0, + "grad_norm": 2.183207737317757, + "language_loss": 0.67619979, + "learning_rate": 2.320329677254007e-06, + "loss": 0.69849902, + "num_input_tokens_seen": 83249195, + "step": 3870, + "time_per_iteration": 2.5976409912109375 + }, + { + "auxiliary_loss_clip": 0.01144666, + "auxiliary_loss_mlp": 0.01086126, + "balance_loss_clip": 1.03208375, + "balance_loss_mlp": 1.00549293, + "epoch": 0.46546022966392114, + "flos": 21141869080320.0, + "grad_norm": 1.9509081892452134, + "language_loss": 0.72366703, + "learning_rate": 2.319560737633697e-06, + "loss": 0.7459749, + "num_input_tokens_seen": 83267915, + "step": 3871, + "time_per_iteration": 2.6160032749176025 + }, + { + "auxiliary_loss_clip": 0.0111687, + "auxiliary_loss_mlp": 0.01086174, + "balance_loss_clip": 1.02868414, + "balance_loss_mlp": 1.00535035, + "epoch": 0.4655804725545602, + "flos": 41171942442240.0, + "grad_norm": 1.546737187259859, + "language_loss": 0.6799047, + "learning_rate": 2.3187917495362775e-06, + "loss": 0.70193517, + "num_input_tokens_seen": 83292325, + "step": 3872, + "time_per_iteration": 2.9016382694244385 + }, + { + "auxiliary_loss_clip": 0.01107328, + "auxiliary_loss_mlp": 0.01086261, + "balance_loss_clip": 1.02894175, + "balance_loss_mlp": 1.00572348, + "epoch": 0.4657007154451993, + "flos": 19570956698880.0, + "grad_norm": 2.7108426347433405, + "language_loss": 0.77196193, + "learning_rate": 2.318022713078403e-06, + "loss": 0.79389775, + "num_input_tokens_seen": 83306905, + "step": 3873, + "time_per_iteration": 2.824380397796631 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.01085683, + "balance_loss_clip": 1.02643025, + "balance_loss_mlp": 1.00504994, + "epoch": 0.4658209583358384, + "flos": 15517826956800.0, + "grad_norm": 2.419424299978035, + "language_loss": 0.85105306, + "learning_rate": 2.3172536283767354e-06, + "loss": 0.87306905, + "num_input_tokens_seen": 83320665, + "step": 3874, + "time_per_iteration": 2.7172975540161133 + }, + { + "auxiliary_loss_clip": 0.01115735, + "auxiliary_loss_mlp": 0.01086106, + "balance_loss_clip": 1.02986157, + "balance_loss_mlp": 1.00537777, + "epoch": 0.4659412012264775, + "flos": 14903180403840.0, + "grad_norm": 1.925137588487387, + "language_loss": 0.80958676, + "learning_rate": 2.3164844955479447e-06, + "loss": 0.8316052, + "num_input_tokens_seen": 83336475, + "step": 3875, + "time_per_iteration": 2.879009485244751 + }, + { + "auxiliary_loss_clip": 0.01094156, + "auxiliary_loss_mlp": 0.01085589, + "balance_loss_clip": 1.03093219, + "balance_loss_mlp": 1.00500333, + "epoch": 0.4660614441171166, + "flos": 24425612478720.0, + "grad_norm": 1.612245146980038, + "language_loss": 0.70372951, + "learning_rate": 2.3157153147087082e-06, + "loss": 0.72552693, + "num_input_tokens_seen": 83358365, + "step": 3876, + "time_per_iteration": 3.699374198913574 + }, + { + "auxiliary_loss_clip": 0.01090633, + "auxiliary_loss_mlp": 0.01086458, + "balance_loss_clip": 1.02891564, + "balance_loss_mlp": 1.00577688, + "epoch": 0.46618168700775564, + "flos": 22091095843200.0, + "grad_norm": 1.6650485983792727, + "language_loss": 0.83188194, + "learning_rate": 2.314946085975709e-06, + "loss": 0.85365283, + "num_input_tokens_seen": 83377345, + "step": 3877, + "time_per_iteration": 2.8278558254241943 + }, + { + "auxiliary_loss_clip": 0.01116108, + "auxiliary_loss_mlp": 0.01085651, + "balance_loss_clip": 1.03029919, + "balance_loss_mlp": 1.00496995, + "epoch": 0.46630192989839475, + "flos": 26176975810560.0, + "grad_norm": 1.7733019906411776, + "language_loss": 0.82328415, + "learning_rate": 2.3141768094656393e-06, + "loss": 0.84530175, + "num_input_tokens_seen": 83395920, + "step": 3878, + "time_per_iteration": 2.7719428539276123 + }, + { + "auxiliary_loss_clip": 0.01089258, + "auxiliary_loss_mlp": 0.01084614, + "balance_loss_clip": 1.02754259, + "balance_loss_mlp": 1.00407577, + "epoch": 0.46642217278903386, + "flos": 11509622150400.0, + "grad_norm": 2.543263453211043, + "language_loss": 0.83146369, + "learning_rate": 2.3134074852951966e-06, + "loss": 0.8532024, + "num_input_tokens_seen": 83412510, + "step": 3879, + "time_per_iteration": 2.843738555908203 + }, + { + "auxiliary_loss_clip": 0.01108578, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_clip": 1.0295372, + "balance_loss_mlp": 1.00524151, + "epoch": 0.4665424156796729, + "flos": 32306819299200.0, + "grad_norm": 1.693940649028537, + "language_loss": 0.77865195, + "learning_rate": 2.312638113581088e-06, + "loss": 0.80059648, + "num_input_tokens_seen": 83432995, + "step": 3880, + "time_per_iteration": 3.7313320636749268 + }, + { + "auxiliary_loss_clip": 0.01134495, + "auxiliary_loss_mlp": 0.01084446, + "balance_loss_clip": 1.02979815, + "balance_loss_mlp": 1.00371754, + "epoch": 0.46666265857031203, + "flos": 18436179254400.0, + "grad_norm": 2.6770368153896658, + "language_loss": 0.78344417, + "learning_rate": 2.311868694440027e-06, + "loss": 0.80563354, + "num_input_tokens_seen": 83447415, + "step": 3881, + "time_per_iteration": 2.658618211746216 + }, + { + "auxiliary_loss_clip": 0.01137689, + "auxiliary_loss_mlp": 0.01079648, + "balance_loss_clip": 1.04025865, + "balance_loss_mlp": 1.00030255, + "epoch": 0.46678290146095114, + "flos": 68438989221120.0, + "grad_norm": 0.7550311060604505, + "language_loss": 0.62487775, + "learning_rate": 2.3110992279887323e-06, + "loss": 0.64705116, + "num_input_tokens_seen": 83519340, + "step": 3882, + "time_per_iteration": 3.3217992782592773 + }, + { + "auxiliary_loss_clip": 0.01115406, + "auxiliary_loss_mlp": 0.01086028, + "balance_loss_clip": 1.02916193, + "balance_loss_mlp": 1.00520396, + "epoch": 0.4669031443515902, + "flos": 17712507945600.0, + "grad_norm": 2.218739363345708, + "language_loss": 0.84922385, + "learning_rate": 2.310329714343932e-06, + "loss": 0.87123823, + "num_input_tokens_seen": 83535490, + "step": 3883, + "time_per_iteration": 3.7860939502716064 + }, + { + "auxiliary_loss_clip": 0.01118462, + "auxiliary_loss_mlp": 0.010841, + "balance_loss_clip": 1.02842236, + "balance_loss_mlp": 1.00351429, + "epoch": 0.4670233872422293, + "flos": 23947748916480.0, + "grad_norm": 1.8771205156808815, + "language_loss": 0.81850708, + "learning_rate": 2.309560153622361e-06, + "loss": 0.84053266, + "num_input_tokens_seen": 83552400, + "step": 3884, + "time_per_iteration": 2.7817132472991943 + }, + { + "auxiliary_loss_clip": 0.01114035, + "auxiliary_loss_mlp": 0.01085554, + "balance_loss_clip": 1.02913308, + "balance_loss_mlp": 1.00477815, + "epoch": 0.4671436301328684, + "flos": 28111268131200.0, + "grad_norm": 1.8823595324562523, + "language_loss": 0.74085772, + "learning_rate": 2.3087905459407602e-06, + "loss": 0.76285362, + "num_input_tokens_seen": 83571340, + "step": 3885, + "time_per_iteration": 2.832637071609497 + }, + { + "auxiliary_loss_clip": 0.01113409, + "auxiliary_loss_mlp": 0.01079576, + "balance_loss_clip": 1.04088116, + "balance_loss_mlp": 1.00022984, + "epoch": 0.46726387302350747, + "flos": 69369684566400.0, + "grad_norm": 0.7986479055797393, + "language_loss": 0.62950695, + "learning_rate": 2.3080208914158795e-06, + "loss": 0.65143681, + "num_input_tokens_seen": 83634340, + "step": 3886, + "time_per_iteration": 3.3030831813812256 + }, + { + "auxiliary_loss_clip": 0.01115868, + "auxiliary_loss_mlp": 0.01084453, + "balance_loss_clip": 1.02762032, + "balance_loss_mlp": 1.00386727, + "epoch": 0.4673841159141466, + "flos": 25519666878720.0, + "grad_norm": 2.179559893302996, + "language_loss": 0.72445053, + "learning_rate": 2.3072511901644753e-06, + "loss": 0.74645376, + "num_input_tokens_seen": 83653410, + "step": 3887, + "time_per_iteration": 3.70963716506958 + }, + { + "auxiliary_loss_clip": 0.01143929, + "auxiliary_loss_mlp": 0.0108688, + "balance_loss_clip": 1.0313611, + "balance_loss_mlp": 1.00629485, + "epoch": 0.4675043588047857, + "flos": 24499265316480.0, + "grad_norm": 2.0238054930856615, + "language_loss": 0.81061149, + "learning_rate": 2.306481442303309e-06, + "loss": 0.8329196, + "num_input_tokens_seen": 83672985, + "step": 3888, + "time_per_iteration": 2.661612033843994 + }, + { + "auxiliary_loss_clip": 0.01132979, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_clip": 1.02878737, + "balance_loss_mlp": 1.00537062, + "epoch": 0.46762460169542475, + "flos": 20960771685120.0, + "grad_norm": 2.5510264894051033, + "language_loss": 0.73331594, + "learning_rate": 2.3057116479491515e-06, + "loss": 0.75550628, + "num_input_tokens_seen": 83692395, + "step": 3889, + "time_per_iteration": 2.760929822921753 + }, + { + "auxiliary_loss_clip": 0.01135242, + "auxiliary_loss_mlp": 0.01085007, + "balance_loss_clip": 1.03022194, + "balance_loss_mlp": 1.00432634, + "epoch": 0.46774484458606386, + "flos": 19171666137600.0, + "grad_norm": 2.485368462163816, + "language_loss": 0.75878525, + "learning_rate": 2.30494180721878e-06, + "loss": 0.78098774, + "num_input_tokens_seen": 83709735, + "step": 3890, + "time_per_iteration": 2.652904987335205 + }, + { + "auxiliary_loss_clip": 0.01134609, + "auxiliary_loss_mlp": 0.01085147, + "balance_loss_clip": 1.03025436, + "balance_loss_mlp": 1.00465703, + "epoch": 0.4678650874767029, + "flos": 17967689141760.0, + "grad_norm": 1.829924003921001, + "language_loss": 0.89669997, + "learning_rate": 2.3041719202289794e-06, + "loss": 0.91889751, + "num_input_tokens_seen": 83725910, + "step": 3891, + "time_per_iteration": 2.6486945152282715 + }, + { + "auxiliary_loss_clip": 0.01135093, + "auxiliary_loss_mlp": 0.01085949, + "balance_loss_clip": 1.03097963, + "balance_loss_mlp": 1.00555468, + "epoch": 0.467985330367342, + "flos": 21360816432000.0, + "grad_norm": 2.0512118416912615, + "language_loss": 0.80532372, + "learning_rate": 2.30340198709654e-06, + "loss": 0.8275342, + "num_input_tokens_seen": 83745745, + "step": 3892, + "time_per_iteration": 2.6758527755737305 + }, + { + "auxiliary_loss_clip": 0.01127233, + "auxiliary_loss_mlp": 0.01086957, + "balance_loss_clip": 1.03073776, + "balance_loss_mlp": 1.00632405, + "epoch": 0.46810557325798113, + "flos": 20521835487360.0, + "grad_norm": 2.0001329234347422, + "language_loss": 0.74289757, + "learning_rate": 2.3026320079382605e-06, + "loss": 0.76503944, + "num_input_tokens_seen": 83762680, + "step": 3893, + "time_per_iteration": 2.7421038150787354 + }, + { + "auxiliary_loss_clip": 0.01143694, + "auxiliary_loss_mlp": 0.01084777, + "balance_loss_clip": 1.03151596, + "balance_loss_mlp": 1.00428724, + "epoch": 0.4682258161486202, + "flos": 30117848572800.0, + "grad_norm": 7.847692219886711, + "language_loss": 0.76447147, + "learning_rate": 2.3018619828709454e-06, + "loss": 0.78675616, + "num_input_tokens_seen": 83784220, + "step": 3894, + "time_per_iteration": 2.7393858432769775 + }, + { + "auxiliary_loss_clip": 0.01125746, + "auxiliary_loss_mlp": 0.00873154, + "balance_loss_clip": 1.02851617, + "balance_loss_mlp": 1.00016737, + "epoch": 0.4683460590392593, + "flos": 25293357239040.0, + "grad_norm": 1.8211083480397277, + "language_loss": 0.82025313, + "learning_rate": 2.3010919120114084e-06, + "loss": 0.84024203, + "num_input_tokens_seen": 83800750, + "step": 3895, + "time_per_iteration": 2.7642765045166016 + }, + { + "auxiliary_loss_clip": 0.01134317, + "auxiliary_loss_mlp": 0.01085982, + "balance_loss_clip": 1.02982426, + "balance_loss_mlp": 1.00520635, + "epoch": 0.4684663019298984, + "flos": 15368330551680.0, + "grad_norm": 2.2375930799695825, + "language_loss": 0.65885615, + "learning_rate": 2.3003217954764672e-06, + "loss": 0.68105924, + "num_input_tokens_seen": 83815455, + "step": 3896, + "time_per_iteration": 2.6433210372924805 + }, + { + "auxiliary_loss_clip": 0.01134627, + "auxiliary_loss_mlp": 0.01085392, + "balance_loss_clip": 1.02993584, + "balance_loss_mlp": 1.00466371, + "epoch": 0.46858654482053747, + "flos": 27778842737280.0, + "grad_norm": 1.684875332310344, + "language_loss": 0.79312694, + "learning_rate": 2.299551633382949e-06, + "loss": 0.81532717, + "num_input_tokens_seen": 83835765, + "step": 3897, + "time_per_iteration": 2.73954701423645 + }, + { + "auxiliary_loss_clip": 0.01123176, + "auxiliary_loss_mlp": 0.01085043, + "balance_loss_clip": 1.0287199, + "balance_loss_mlp": 1.00441003, + "epoch": 0.4687067877111766, + "flos": 18040623707520.0, + "grad_norm": 2.2374285706073858, + "language_loss": 0.85600859, + "learning_rate": 2.2987814258476854e-06, + "loss": 0.8780908, + "num_input_tokens_seen": 83853565, + "step": 3898, + "time_per_iteration": 2.6832032203674316 + }, + { + "auxiliary_loss_clip": 0.01108553, + "auxiliary_loss_mlp": 0.01087037, + "balance_loss_clip": 1.02875018, + "balance_loss_mlp": 1.00626099, + "epoch": 0.4688270306018157, + "flos": 16977380198400.0, + "grad_norm": 3.949950123608892, + "language_loss": 0.68028319, + "learning_rate": 2.2980111729875177e-06, + "loss": 0.7022391, + "num_input_tokens_seen": 83869815, + "step": 3899, + "time_per_iteration": 2.831068277359009 + }, + { + "auxiliary_loss_clip": 0.01122457, + "auxiliary_loss_mlp": 0.01086525, + "balance_loss_clip": 1.02884245, + "balance_loss_mlp": 1.00584424, + "epoch": 0.46894727349245474, + "flos": 17821640442240.0, + "grad_norm": 1.716653808676285, + "language_loss": 0.82143986, + "learning_rate": 2.2972408749192917e-06, + "loss": 0.8435297, + "num_input_tokens_seen": 83887545, + "step": 3900, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.01132925, + "auxiliary_loss_mlp": 0.00873073, + "balance_loss_clip": 1.03071404, + "balance_loss_mlp": 1.00015843, + "epoch": 0.46906751638309385, + "flos": 21471349559040.0, + "grad_norm": 1.7691518620285926, + "language_loss": 0.66470683, + "learning_rate": 2.296470531759861e-06, + "loss": 0.68476677, + "num_input_tokens_seen": 83905645, + "step": 3901, + "time_per_iteration": 2.6898488998413086 + }, + { + "auxiliary_loss_clip": 0.01114016, + "auxiliary_loss_mlp": 0.01085297, + "balance_loss_clip": 1.02763271, + "balance_loss_mlp": 1.00461638, + "epoch": 0.46918775927373296, + "flos": 20337829090560.0, + "grad_norm": 1.9704594906661366, + "language_loss": 0.79367948, + "learning_rate": 2.2957001436260866e-06, + "loss": 0.81567258, + "num_input_tokens_seen": 83922705, + "step": 3902, + "time_per_iteration": 3.634495735168457 + }, + { + "auxiliary_loss_clip": 0.01127545, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_clip": 1.0312748, + "balance_loss_mlp": 1.00570297, + "epoch": 0.469308002164372, + "flos": 18403249461120.0, + "grad_norm": 1.5757089748031448, + "language_loss": 0.72952062, + "learning_rate": 2.294929710634836e-06, + "loss": 0.75165939, + "num_input_tokens_seen": 83940795, + "step": 3903, + "time_per_iteration": 2.8276236057281494 + }, + { + "auxiliary_loss_clip": 0.01134597, + "auxiliary_loss_mlp": 0.01084892, + "balance_loss_clip": 1.03017449, + "balance_loss_mlp": 1.00435448, + "epoch": 0.46942824505501113, + "flos": 37962067363200.0, + "grad_norm": 1.7236823862017296, + "language_loss": 0.61023927, + "learning_rate": 2.2941592329029823e-06, + "loss": 0.63243413, + "num_input_tokens_seen": 83961900, + "step": 3904, + "time_per_iteration": 2.82419490814209 + }, + { + "auxiliary_loss_clip": 0.01134739, + "auxiliary_loss_mlp": 0.01086111, + "balance_loss_clip": 1.03013492, + "balance_loss_mlp": 1.00542998, + "epoch": 0.46954848794565024, + "flos": 21872507627520.0, + "grad_norm": 1.704847131946235, + "language_loss": 0.79070145, + "learning_rate": 2.2933887105474067e-06, + "loss": 0.81290996, + "num_input_tokens_seen": 83980075, + "step": 3905, + "time_per_iteration": 3.5913400650024414 + }, + { + "auxiliary_loss_clip": 0.01132337, + "auxiliary_loss_mlp": 0.01084907, + "balance_loss_clip": 1.03018737, + "balance_loss_mlp": 1.00451243, + "epoch": 0.4696687308362893, + "flos": 22016545165440.0, + "grad_norm": 1.5371656653572656, + "language_loss": 0.81238902, + "learning_rate": 2.2926181436849974e-06, + "loss": 0.83456141, + "num_input_tokens_seen": 83999430, + "step": 3906, + "time_per_iteration": 2.7573554515838623 + }, + { + "auxiliary_loss_clip": 0.01133716, + "auxiliary_loss_mlp": 0.01085647, + "balance_loss_clip": 1.03068411, + "balance_loss_mlp": 1.00496626, + "epoch": 0.4697889737269284, + "flos": 21613663244160.0, + "grad_norm": 1.5591399416541054, + "language_loss": 0.72833842, + "learning_rate": 2.2918475324326478e-06, + "loss": 0.75053209, + "num_input_tokens_seen": 84019150, + "step": 3907, + "time_per_iteration": 2.724949359893799 + }, + { + "auxiliary_loss_clip": 0.01133054, + "auxiliary_loss_mlp": 0.0087318, + "balance_loss_clip": 1.02976227, + "balance_loss_mlp": 1.00018501, + "epoch": 0.46990921661756746, + "flos": 25228323665280.0, + "grad_norm": 2.327745825006987, + "language_loss": 0.91499603, + "learning_rate": 2.2910768769072603e-06, + "loss": 0.93505836, + "num_input_tokens_seen": 84037930, + "step": 3908, + "time_per_iteration": 3.753512144088745 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01087513, + "balance_loss_clip": 1.03139043, + "balance_loss_mlp": 1.00692749, + "epoch": 0.47002945950820657, + "flos": 13844031045120.0, + "grad_norm": 11.66652949257523, + "language_loss": 0.75719857, + "learning_rate": 2.2903061772257417e-06, + "loss": 0.77943289, + "num_input_tokens_seen": 84055915, + "step": 3909, + "time_per_iteration": 2.644365072250366 + }, + { + "auxiliary_loss_clip": 0.0113306, + "auxiliary_loss_mlp": 0.01085786, + "balance_loss_clip": 1.03040981, + "balance_loss_mlp": 1.00515294, + "epoch": 0.4701497023988457, + "flos": 26247001374720.0, + "grad_norm": 1.465511940042844, + "language_loss": 0.78371787, + "learning_rate": 2.289535433505007e-06, + "loss": 0.80590636, + "num_input_tokens_seen": 84077270, + "step": 3910, + "time_per_iteration": 2.7675347328186035 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01084263, + "balance_loss_clip": 1.03113985, + "balance_loss_mlp": 1.00377285, + "epoch": 0.47026994528948474, + "flos": 25629517647360.0, + "grad_norm": 1.8652218797455613, + "language_loss": 0.63733876, + "learning_rate": 2.2887646458619767e-06, + "loss": 0.65929329, + "num_input_tokens_seen": 84098635, + "step": 3911, + "time_per_iteration": 2.7347424030303955 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.01085579, + "balance_loss_clip": 1.03000379, + "balance_loss_mlp": 1.00489855, + "epoch": 0.47039018818012385, + "flos": 20554406144640.0, + "grad_norm": 2.0852728190077965, + "language_loss": 0.76306164, + "learning_rate": 2.2879938144135797e-06, + "loss": 0.78507781, + "num_input_tokens_seen": 84114740, + "step": 3912, + "time_per_iteration": 3.6464638710021973 + }, + { + "auxiliary_loss_clip": 0.01107358, + "auxiliary_loss_mlp": 0.00873051, + "balance_loss_clip": 1.0264647, + "balance_loss_mlp": 1.00019622, + "epoch": 0.47051043107076296, + "flos": 21577249831680.0, + "grad_norm": 1.5969879365745496, + "language_loss": 0.75317061, + "learning_rate": 2.2872229392767496e-06, + "loss": 0.77297467, + "num_input_tokens_seen": 84134845, + "step": 3913, + "time_per_iteration": 2.8516910076141357 + }, + { + "auxiliary_loss_clip": 0.01134773, + "auxiliary_loss_mlp": 0.01085223, + "balance_loss_clip": 1.03024006, + "balance_loss_mlp": 1.00463796, + "epoch": 0.470630673961402, + "flos": 18953185662720.0, + "grad_norm": 1.4420112278491972, + "language_loss": 0.74741375, + "learning_rate": 2.286452020568428e-06, + "loss": 0.76961374, + "num_input_tokens_seen": 84152920, + "step": 3914, + "time_per_iteration": 2.664451837539673 + }, + { + "auxiliary_loss_clip": 0.01144437, + "auxiliary_loss_mlp": 0.0108532, + "balance_loss_clip": 1.0310328, + "balance_loss_mlp": 1.00468671, + "epoch": 0.4707509168520411, + "flos": 19938969492480.0, + "grad_norm": 1.6598633375710392, + "language_loss": 0.72756582, + "learning_rate": 2.2856810584055637e-06, + "loss": 0.74986339, + "num_input_tokens_seen": 84170455, + "step": 3915, + "time_per_iteration": 2.832994222640991 + }, + { + "auxiliary_loss_clip": 0.01134026, + "auxiliary_loss_mlp": 0.01085125, + "balance_loss_clip": 1.03014016, + "balance_loss_mlp": 1.00449181, + "epoch": 0.47087115974268023, + "flos": 40118754741120.0, + "grad_norm": 1.4700748157330072, + "language_loss": 0.67794394, + "learning_rate": 2.2849100529051085e-06, + "loss": 0.70013547, + "num_input_tokens_seen": 84197390, + "step": 3916, + "time_per_iteration": 2.903879165649414 + }, + { + "auxiliary_loss_clip": 0.01144265, + "auxiliary_loss_mlp": 0.01085895, + "balance_loss_clip": 1.03169036, + "balance_loss_mlp": 1.00526226, + "epoch": 0.4709914026333193, + "flos": 13552723745280.0, + "grad_norm": 2.2914724531667767, + "language_loss": 0.79594231, + "learning_rate": 2.284139004184026e-06, + "loss": 0.81824392, + "num_input_tokens_seen": 84214620, + "step": 3917, + "time_per_iteration": 2.743997812271118 + }, + { + "auxiliary_loss_clip": 0.01144769, + "auxiliary_loss_mlp": 0.01084627, + "balance_loss_clip": 1.0321722, + "balance_loss_mlp": 1.00404119, + "epoch": 0.4711116455239584, + "flos": 19974628719360.0, + "grad_norm": 1.8406744416298417, + "language_loss": 0.74508584, + "learning_rate": 2.2833679123592814e-06, + "loss": 0.76737976, + "num_input_tokens_seen": 84231880, + "step": 3918, + "time_per_iteration": 2.641589879989624 + }, + { + "auxiliary_loss_clip": 0.01122698, + "auxiliary_loss_mlp": 0.01084558, + "balance_loss_clip": 1.02865231, + "balance_loss_mlp": 1.00397301, + "epoch": 0.4712318884145975, + "flos": 32124824064000.0, + "grad_norm": 1.6790091306386201, + "language_loss": 0.63509238, + "learning_rate": 2.2825967775478508e-06, + "loss": 0.65716493, + "num_input_tokens_seen": 84252980, + "step": 3919, + "time_per_iteration": 2.8126285076141357 + }, + { + "auxiliary_loss_clip": 0.0114317, + "auxiliary_loss_mlp": 0.01083821, + "balance_loss_clip": 1.03066528, + "balance_loss_mlp": 1.00323582, + "epoch": 0.47135213130523657, + "flos": 20047850593920.0, + "grad_norm": 1.877905665708856, + "language_loss": 0.83368802, + "learning_rate": 2.2818255998667135e-06, + "loss": 0.85595793, + "num_input_tokens_seen": 84271490, + "step": 3920, + "time_per_iteration": 2.6524345874786377 + }, + { + "auxiliary_loss_clip": 0.01134077, + "auxiliary_loss_mlp": 0.01085403, + "balance_loss_clip": 1.03186393, + "balance_loss_mlp": 1.00481749, + "epoch": 0.4714723741958757, + "flos": 19426990988160.0, + "grad_norm": 1.5095801844743382, + "language_loss": 0.79046643, + "learning_rate": 2.2810543794328566e-06, + "loss": 0.81266117, + "num_input_tokens_seen": 84290525, + "step": 3921, + "time_per_iteration": 2.6966817378997803 + }, + { + "auxiliary_loss_clip": 0.0113636, + "auxiliary_loss_mlp": 0.0108491, + "balance_loss_clip": 1.03191352, + "balance_loss_mlp": 1.00437188, + "epoch": 0.4715926170865148, + "flos": 20373883367040.0, + "grad_norm": 1.7089443824478388, + "language_loss": 0.82010531, + "learning_rate": 2.2802831163632735e-06, + "loss": 0.842318, + "num_input_tokens_seen": 84309245, + "step": 3922, + "time_per_iteration": 2.7230939865112305 + }, + { + "auxiliary_loss_clip": 0.01094495, + "auxiliary_loss_mlp": 0.01086174, + "balance_loss_clip": 1.02654457, + "balance_loss_mlp": 1.0054934, + "epoch": 0.47171285997715384, + "flos": 22672884430080.0, + "grad_norm": 1.7394503701946704, + "language_loss": 0.74418712, + "learning_rate": 2.279511810774965e-06, + "loss": 0.76599383, + "num_input_tokens_seen": 84330775, + "step": 3923, + "time_per_iteration": 2.8282968997955322 + }, + { + "auxiliary_loss_clip": 0.01143484, + "auxiliary_loss_mlp": 0.01084701, + "balance_loss_clip": 1.03096521, + "balance_loss_mlp": 1.00421119, + "epoch": 0.47183310286779295, + "flos": 21105419754240.0, + "grad_norm": 2.31680381846435, + "language_loss": 0.71505404, + "learning_rate": 2.2787404627849364e-06, + "loss": 0.7373358, + "num_input_tokens_seen": 84349985, + "step": 3924, + "time_per_iteration": 2.6399457454681396 + }, + { + "auxiliary_loss_clip": 0.01127361, + "auxiliary_loss_mlp": 0.0108522, + "balance_loss_clip": 1.03079784, + "balance_loss_mlp": 1.00472999, + "epoch": 0.471953345758432, + "flos": 21726566668800.0, + "grad_norm": 1.6444689090218807, + "language_loss": 0.79049492, + "learning_rate": 2.277969072510202e-06, + "loss": 0.81262064, + "num_input_tokens_seen": 84368965, + "step": 3925, + "time_per_iteration": 2.728041410446167 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01085716, + "balance_loss_clip": 1.02952266, + "balance_loss_mlp": 1.0052259, + "epoch": 0.4720735886490711, + "flos": 19861078849920.0, + "grad_norm": 1.520095766682826, + "language_loss": 0.81607664, + "learning_rate": 2.2771976400677803e-06, + "loss": 0.83817399, + "num_input_tokens_seen": 84387795, + "step": 3926, + "time_per_iteration": 2.7318220138549805 + }, + { + "auxiliary_loss_clip": 0.01105588, + "auxiliary_loss_mlp": 0.01084378, + "balance_loss_clip": 1.02801716, + "balance_loss_mlp": 1.00388789, + "epoch": 0.47219383153971023, + "flos": 19171809792000.0, + "grad_norm": 1.764981568293088, + "language_loss": 0.78956312, + "learning_rate": 2.2764261655746965e-06, + "loss": 0.81146282, + "num_input_tokens_seen": 84405290, + "step": 3927, + "time_per_iteration": 3.6882026195526123 + }, + { + "auxiliary_loss_clip": 0.01114578, + "auxiliary_loss_mlp": 0.0108557, + "balance_loss_clip": 1.02786326, + "balance_loss_mlp": 1.00493646, + "epoch": 0.4723140744303493, + "flos": 23224005780480.0, + "grad_norm": 1.634265281368125, + "language_loss": 0.75739044, + "learning_rate": 2.2756546491479832e-06, + "loss": 0.77939188, + "num_input_tokens_seen": 84426205, + "step": 3928, + "time_per_iteration": 2.7668251991271973 + }, + { + "auxiliary_loss_clip": 0.01142963, + "auxiliary_loss_mlp": 0.0087315, + "balance_loss_clip": 1.03004611, + "balance_loss_mlp": 1.00015569, + "epoch": 0.4724343173209884, + "flos": 18223265387520.0, + "grad_norm": 2.4673095901758906, + "language_loss": 0.80298412, + "learning_rate": 2.274883090904679e-06, + "loss": 0.82314527, + "num_input_tokens_seen": 84443970, + "step": 3929, + "time_per_iteration": 2.5765066146850586 + }, + { + "auxiliary_loss_clip": 0.01145089, + "auxiliary_loss_mlp": 0.01086798, + "balance_loss_clip": 1.03246641, + "balance_loss_mlp": 1.00606966, + "epoch": 0.4725545602116275, + "flos": 21251037490560.0, + "grad_norm": 2.4089842011906573, + "language_loss": 0.67884016, + "learning_rate": 2.2741114909618283e-06, + "loss": 0.70115912, + "num_input_tokens_seen": 84459865, + "step": 3930, + "time_per_iteration": 3.55894136428833 + }, + { + "auxiliary_loss_clip": 0.01113934, + "auxiliary_loss_mlp": 0.0108395, + "balance_loss_clip": 1.02843618, + "balance_loss_mlp": 1.00345993, + "epoch": 0.47267480310226656, + "flos": 21434002392960.0, + "grad_norm": 2.62408834487033, + "language_loss": 0.71949887, + "learning_rate": 2.2733398494364828e-06, + "loss": 0.74147773, + "num_input_tokens_seen": 84479110, + "step": 3931, + "time_per_iteration": 2.839597702026367 + }, + { + "auxiliary_loss_clip": 0.01119159, + "auxiliary_loss_mlp": 0.01086389, + "balance_loss_clip": 1.02652669, + "balance_loss_mlp": 1.00580394, + "epoch": 0.47279504599290567, + "flos": 18770508069120.0, + "grad_norm": 1.9334326326119011, + "language_loss": 0.84710109, + "learning_rate": 2.272568166445699e-06, + "loss": 0.8691566, + "num_input_tokens_seen": 84497675, + "step": 3932, + "time_per_iteration": 2.699707269668579 + }, + { + "auxiliary_loss_clip": 0.0113332, + "auxiliary_loss_mlp": 0.01085223, + "balance_loss_clip": 1.0294199, + "balance_loss_mlp": 1.00458992, + "epoch": 0.4729152888835448, + "flos": 21105742976640.0, + "grad_norm": 1.9214564935745369, + "language_loss": 0.64785945, + "learning_rate": 2.271796442106541e-06, + "loss": 0.6700449, + "num_input_tokens_seen": 84517030, + "step": 3933, + "time_per_iteration": 2.6900460720062256 + }, + { + "auxiliary_loss_clip": 0.01109883, + "auxiliary_loss_mlp": 0.01079763, + "balance_loss_clip": 1.03725934, + "balance_loss_mlp": 1.00041771, + "epoch": 0.47303553177418384, + "flos": 70201877840640.0, + "grad_norm": 0.802857583568476, + "language_loss": 0.56491792, + "learning_rate": 2.271024676536079e-06, + "loss": 0.5868144, + "num_input_tokens_seen": 84577290, + "step": 3934, + "time_per_iteration": 4.24645209312439 + }, + { + "auxiliary_loss_clip": 0.01124984, + "auxiliary_loss_mlp": 0.01085941, + "balance_loss_clip": 1.0301137, + "balance_loss_mlp": 1.00521207, + "epoch": 0.47315577466482295, + "flos": 22455122227200.0, + "grad_norm": 1.7816393081276751, + "language_loss": 0.73433715, + "learning_rate": 2.2702528698513894e-06, + "loss": 0.75644636, + "num_input_tokens_seen": 84598415, + "step": 3935, + "time_per_iteration": 2.7330732345581055 + }, + { + "auxiliary_loss_clip": 0.01128341, + "auxiliary_loss_mlp": 0.01084261, + "balance_loss_clip": 1.03133655, + "balance_loss_mlp": 1.00367594, + "epoch": 0.47327601755546206, + "flos": 24352857480960.0, + "grad_norm": 1.811578364798649, + "language_loss": 0.78666627, + "learning_rate": 2.269481022169554e-06, + "loss": 0.80879223, + "num_input_tokens_seen": 84617010, + "step": 3936, + "time_per_iteration": 2.790842294692993 + }, + { + "auxiliary_loss_clip": 0.01125344, + "auxiliary_loss_mlp": 0.01085184, + "balance_loss_clip": 1.02944255, + "balance_loss_mlp": 1.00445509, + "epoch": 0.4733962604461011, + "flos": 22926772736640.0, + "grad_norm": 3.0913613448754638, + "language_loss": 0.8033337, + "learning_rate": 2.2687091336076614e-06, + "loss": 0.82543898, + "num_input_tokens_seen": 84636350, + "step": 3937, + "time_per_iteration": 3.6755428314208984 + }, + { + "auxiliary_loss_clip": 0.01133145, + "auxiliary_loss_mlp": 0.01084884, + "balance_loss_clip": 1.03017116, + "balance_loss_mlp": 1.00434566, + "epoch": 0.4735165033367402, + "flos": 18327369980160.0, + "grad_norm": 1.821089167634868, + "language_loss": 0.80211461, + "learning_rate": 2.267937204282807e-06, + "loss": 0.82429487, + "num_input_tokens_seen": 84653490, + "step": 3938, + "time_per_iteration": 2.7805817127227783 + }, + { + "auxiliary_loss_clip": 0.01135693, + "auxiliary_loss_mlp": 0.01086358, + "balance_loss_clip": 1.03148806, + "balance_loss_mlp": 1.00548673, + "epoch": 0.4736367462273793, + "flos": 23037018554880.0, + "grad_norm": 1.9066313133018962, + "language_loss": 0.78576922, + "learning_rate": 2.2671652343120926e-06, + "loss": 0.80798984, + "num_input_tokens_seen": 84673965, + "step": 3939, + "time_per_iteration": 2.749016523361206 + }, + { + "auxiliary_loss_clip": 0.01143892, + "auxiliary_loss_mlp": 0.01084722, + "balance_loss_clip": 1.03174067, + "balance_loss_mlp": 1.00418448, + "epoch": 0.4737569891180184, + "flos": 25374336451200.0, + "grad_norm": 2.0114158006764433, + "language_loss": 0.80595565, + "learning_rate": 2.2663932238126236e-06, + "loss": 0.82824183, + "num_input_tokens_seen": 84692525, + "step": 3940, + "time_per_iteration": 2.705077886581421 + }, + { + "auxiliary_loss_clip": 0.01134786, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_clip": 1.03040588, + "balance_loss_mlp": 1.00549364, + "epoch": 0.4738772320086575, + "flos": 25849326925440.0, + "grad_norm": 1.3951460505751214, + "language_loss": 0.80119526, + "learning_rate": 2.265621172901515e-06, + "loss": 0.82340384, + "num_input_tokens_seen": 84715640, + "step": 3941, + "time_per_iteration": 2.7352819442749023 + }, + { + "auxiliary_loss_clip": 0.01144665, + "auxiliary_loss_mlp": 0.01087676, + "balance_loss_clip": 1.03219235, + "balance_loss_mlp": 1.00699508, + "epoch": 0.47399747489929656, + "flos": 27564420499200.0, + "grad_norm": 1.9949305974741185, + "language_loss": 0.71594334, + "learning_rate": 2.2648490816958854e-06, + "loss": 0.73826677, + "num_input_tokens_seen": 84736635, + "step": 3942, + "time_per_iteration": 2.6728675365448 + }, + { + "auxiliary_loss_clip": 0.01135766, + "auxiliary_loss_mlp": 0.01085874, + "balance_loss_clip": 1.03124118, + "balance_loss_mlp": 1.00524068, + "epoch": 0.47411771778993567, + "flos": 24863650836480.0, + "grad_norm": 2.033054093408769, + "language_loss": 0.73271227, + "learning_rate": 2.264076950312861e-06, + "loss": 0.75492871, + "num_input_tokens_seen": 84755445, + "step": 3943, + "time_per_iteration": 2.802107810974121 + }, + { + "auxiliary_loss_clip": 0.01122604, + "auxiliary_loss_mlp": 0.01085533, + "balance_loss_clip": 1.02801013, + "balance_loss_mlp": 1.00480437, + "epoch": 0.4742379606805748, + "flos": 22748009725440.0, + "grad_norm": 2.501853195730132, + "language_loss": 0.82572001, + "learning_rate": 2.2633047788695727e-06, + "loss": 0.84780139, + "num_input_tokens_seen": 84775750, + "step": 3944, + "time_per_iteration": 2.7244603633880615 + }, + { + "auxiliary_loss_clip": 0.01122301, + "auxiliary_loss_mlp": 0.0108609, + "balance_loss_clip": 1.02855563, + "balance_loss_mlp": 1.00555217, + "epoch": 0.47435820357121383, + "flos": 19681130689920.0, + "grad_norm": 1.766010097049687, + "language_loss": 0.64312804, + "learning_rate": 2.262532567483159e-06, + "loss": 0.66521198, + "num_input_tokens_seen": 84794310, + "step": 3945, + "time_per_iteration": 2.838390588760376 + }, + { + "auxiliary_loss_clip": 0.01144634, + "auxiliary_loss_mlp": 0.00873129, + "balance_loss_clip": 1.03205562, + "balance_loss_mlp": 1.00021434, + "epoch": 0.47447844646185294, + "flos": 25228718714880.0, + "grad_norm": 1.8752398077088428, + "language_loss": 0.79869187, + "learning_rate": 2.2617603162707635e-06, + "loss": 0.81886947, + "num_input_tokens_seen": 84814720, + "step": 3946, + "time_per_iteration": 2.742544651031494 + }, + { + "auxiliary_loss_clip": 0.01143529, + "auxiliary_loss_mlp": 0.01084731, + "balance_loss_clip": 1.03155041, + "balance_loss_mlp": 1.00414586, + "epoch": 0.47459868935249205, + "flos": 24570619683840.0, + "grad_norm": 1.704374690947162, + "language_loss": 0.82747948, + "learning_rate": 2.2609880253495363e-06, + "loss": 0.84976208, + "num_input_tokens_seen": 84834355, + "step": 3947, + "time_per_iteration": 2.646653175354004 + }, + { + "auxiliary_loss_clip": 0.01103102, + "auxiliary_loss_mlp": 0.01085085, + "balance_loss_clip": 1.03086233, + "balance_loss_mlp": 1.00440383, + "epoch": 0.4747189322431311, + "flos": 20558500295040.0, + "grad_norm": 3.0884496189704036, + "language_loss": 0.86682808, + "learning_rate": 2.260215694836633e-06, + "loss": 0.88870996, + "num_input_tokens_seen": 84853530, + "step": 3948, + "time_per_iteration": 2.8112921714782715 + }, + { + "auxiliary_loss_clip": 0.01107913, + "auxiliary_loss_mlp": 0.00873145, + "balance_loss_clip": 1.02912319, + "balance_loss_mlp": 1.00023055, + "epoch": 0.4748391751337702, + "flos": 25995231970560.0, + "grad_norm": 1.6834810951068042, + "language_loss": 0.6482172, + "learning_rate": 2.2594433248492157e-06, + "loss": 0.66802776, + "num_input_tokens_seen": 84872505, + "step": 3949, + "time_per_iteration": 2.861537456512451 + }, + { + "auxiliary_loss_clip": 0.0113475, + "auxiliary_loss_mlp": 0.01086693, + "balance_loss_clip": 1.02988791, + "balance_loss_mlp": 1.00601196, + "epoch": 0.47495941802440933, + "flos": 22821052032000.0, + "grad_norm": 1.605404648659811, + "language_loss": 0.80059272, + "learning_rate": 2.2586709155044527e-06, + "loss": 0.82280713, + "num_input_tokens_seen": 84893105, + "step": 3950, + "time_per_iteration": 2.786391019821167 + }, + { + "auxiliary_loss_clip": 0.0114339, + "auxiliary_loss_mlp": 0.01085913, + "balance_loss_clip": 1.0311203, + "balance_loss_mlp": 1.00527954, + "epoch": 0.4750796609150484, + "flos": 27891782075520.0, + "grad_norm": 1.5165554385868514, + "language_loss": 0.75910133, + "learning_rate": 2.2578984669195167e-06, + "loss": 0.78139436, + "num_input_tokens_seen": 84914070, + "step": 3951, + "time_per_iteration": 2.757028579711914 + }, + { + "auxiliary_loss_clip": 0.01133923, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_clip": 1.02977312, + "balance_loss_mlp": 1.00427651, + "epoch": 0.4751999038056875, + "flos": 35660085471360.0, + "grad_norm": 1.8337064797219786, + "language_loss": 0.68123949, + "learning_rate": 2.2571259792115887e-06, + "loss": 0.70342636, + "num_input_tokens_seen": 84935290, + "step": 3952, + "time_per_iteration": 2.830681324005127 + }, + { + "auxiliary_loss_clip": 0.01135373, + "auxiliary_loss_mlp": 0.01085038, + "balance_loss_clip": 1.03145337, + "balance_loss_mlp": 1.00454807, + "epoch": 0.4753201466963266, + "flos": 22090880361600.0, + "grad_norm": 1.6958088397215951, + "language_loss": 0.79541385, + "learning_rate": 2.2563534524978544e-06, + "loss": 0.81761795, + "num_input_tokens_seen": 84952760, + "step": 3953, + "time_per_iteration": 3.6624693870544434 + }, + { + "auxiliary_loss_clip": 0.01109662, + "auxiliary_loss_mlp": 0.01085124, + "balance_loss_clip": 1.02646589, + "balance_loss_mlp": 1.00463343, + "epoch": 0.47544038958696566, + "flos": 30190854965760.0, + "grad_norm": 2.0114934150280828, + "language_loss": 0.70548862, + "learning_rate": 2.2555808868955052e-06, + "loss": 0.72743654, + "num_input_tokens_seen": 84974890, + "step": 3954, + "time_per_iteration": 2.8499667644500732 + }, + { + "auxiliary_loss_clip": 0.0109411, + "auxiliary_loss_mlp": 0.01086338, + "balance_loss_clip": 1.02350414, + "balance_loss_mlp": 1.00565696, + "epoch": 0.47556063247760477, + "flos": 23472219738240.0, + "grad_norm": 2.195039267047856, + "language_loss": 0.7351445, + "learning_rate": 2.254808282521738e-06, + "loss": 0.75694895, + "num_input_tokens_seen": 84993640, + "step": 3955, + "time_per_iteration": 2.8544321060180664 + }, + { + "auxiliary_loss_clip": 0.0111609, + "auxiliary_loss_mlp": 0.00873197, + "balance_loss_clip": 1.02921176, + "balance_loss_mlp": 1.00029457, + "epoch": 0.4756808753682438, + "flos": 25155209531520.0, + "grad_norm": 1.7391231250649317, + "language_loss": 0.80860782, + "learning_rate": 2.2540356394937573e-06, + "loss": 0.82850069, + "num_input_tokens_seen": 85012340, + "step": 3956, + "time_per_iteration": 3.735306978225708 + }, + { + "auxiliary_loss_clip": 0.01114404, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_clip": 1.02818227, + "balance_loss_mlp": 1.00571513, + "epoch": 0.47580111825888294, + "flos": 15669729573120.0, + "grad_norm": 2.1095465918756786, + "language_loss": 0.84319562, + "learning_rate": 2.253262957928772e-06, + "loss": 0.8652041, + "num_input_tokens_seen": 85029225, + "step": 3957, + "time_per_iteration": 2.770679473876953 + }, + { + "auxiliary_loss_clip": 0.01125657, + "auxiliary_loss_mlp": 0.01086874, + "balance_loss_clip": 1.02926993, + "balance_loss_mlp": 1.00619292, + "epoch": 0.47592136114952205, + "flos": 17636556637440.0, + "grad_norm": 1.702561644653109, + "language_loss": 0.72107524, + "learning_rate": 2.2524902379439976e-06, + "loss": 0.74320054, + "num_input_tokens_seen": 85047895, + "step": 3958, + "time_per_iteration": 2.769648313522339 + }, + { + "auxiliary_loss_clip": 0.01084917, + "auxiliary_loss_mlp": 0.01080121, + "balance_loss_clip": 1.02935255, + "balance_loss_mlp": 1.00077569, + "epoch": 0.4760416040401611, + "flos": 61417159292160.0, + "grad_norm": 0.7415427975525463, + "language_loss": 0.63691044, + "learning_rate": 2.251717479656655e-06, + "loss": 0.65856081, + "num_input_tokens_seen": 85112690, + "step": 3959, + "time_per_iteration": 4.392689228057861 + }, + { + "auxiliary_loss_clip": 0.0114302, + "auxiliary_loss_mlp": 0.0108742, + "balance_loss_clip": 1.03046823, + "balance_loss_mlp": 1.00669122, + "epoch": 0.4761618469308002, + "flos": 18405871153920.0, + "grad_norm": 1.6833122409518773, + "language_loss": 0.76259267, + "learning_rate": 2.2509446831839704e-06, + "loss": 0.78489709, + "num_input_tokens_seen": 85132130, + "step": 3960, + "time_per_iteration": 2.708158254623413 + }, + { + "auxiliary_loss_clip": 0.01123704, + "auxiliary_loss_mlp": 0.01086976, + "balance_loss_clip": 1.0287112, + "balance_loss_mlp": 1.00624728, + "epoch": 0.4762820898214393, + "flos": 18040911016320.0, + "grad_norm": 2.0319391810174996, + "language_loss": 0.81886375, + "learning_rate": 2.250171848643177e-06, + "loss": 0.84097052, + "num_input_tokens_seen": 85149420, + "step": 3961, + "time_per_iteration": 2.661742687225342 + }, + { + "auxiliary_loss_clip": 0.011233, + "auxiliary_loss_mlp": 0.01084986, + "balance_loss_clip": 1.02946365, + "balance_loss_mlp": 1.00449562, + "epoch": 0.4764023327120784, + "flos": 19318253541120.0, + "grad_norm": 1.88026967496055, + "language_loss": 0.85689437, + "learning_rate": 2.249398976151513e-06, + "loss": 0.87897724, + "num_input_tokens_seen": 85166970, + "step": 3962, + "time_per_iteration": 3.649319887161255 + }, + { + "auxiliary_loss_clip": 0.01143341, + "auxiliary_loss_mlp": 0.01086896, + "balance_loss_clip": 1.03130317, + "balance_loss_mlp": 1.00631022, + "epoch": 0.4765225756027175, + "flos": 22747255539840.0, + "grad_norm": 2.2040459114351587, + "language_loss": 0.78595579, + "learning_rate": 2.248626065826223e-06, + "loss": 0.80825812, + "num_input_tokens_seen": 85185175, + "step": 3963, + "time_per_iteration": 2.661618709564209 + }, + { + "auxiliary_loss_clip": 0.01136192, + "auxiliary_loss_mlp": 0.01079725, + "balance_loss_clip": 1.03891683, + "balance_loss_mlp": 1.00037885, + "epoch": 0.4766428184933566, + "flos": 65933392106880.0, + "grad_norm": 0.7604042761956064, + "language_loss": 0.62570751, + "learning_rate": 2.2478531177845564e-06, + "loss": 0.64786667, + "num_input_tokens_seen": 85246170, + "step": 3964, + "time_per_iteration": 3.182790756225586 + }, + { + "auxiliary_loss_clip": 0.01124043, + "auxiliary_loss_mlp": 0.01086828, + "balance_loss_clip": 1.02938294, + "balance_loss_mlp": 1.00624287, + "epoch": 0.47676306138399566, + "flos": 24136495908480.0, + "grad_norm": 1.6634093946500284, + "language_loss": 0.8518123, + "learning_rate": 2.247080132143769e-06, + "loss": 0.87392104, + "num_input_tokens_seen": 85268525, + "step": 3965, + "time_per_iteration": 2.777001142501831 + }, + { + "auxiliary_loss_clip": 0.01113809, + "auxiliary_loss_mlp": 0.01085433, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.00479984, + "epoch": 0.47688330427463477, + "flos": 12604322995200.0, + "grad_norm": 3.8272155455943104, + "language_loss": 0.69212455, + "learning_rate": 2.246307109021121e-06, + "loss": 0.71411699, + "num_input_tokens_seen": 85285930, + "step": 3966, + "time_per_iteration": 2.6808061599731445 + }, + { + "auxiliary_loss_clip": 0.01126552, + "auxiliary_loss_mlp": 0.01086099, + "balance_loss_clip": 1.02977562, + "balance_loss_mlp": 1.00551319, + "epoch": 0.4770035471652739, + "flos": 21390585828480.0, + "grad_norm": 1.6040608283039284, + "language_loss": 0.82233608, + "learning_rate": 2.2455340485338817e-06, + "loss": 0.84446257, + "num_input_tokens_seen": 85303565, + "step": 3967, + "time_per_iteration": 2.726628541946411 + }, + { + "auxiliary_loss_clip": 0.01135084, + "auxiliary_loss_mlp": 0.0108479, + "balance_loss_clip": 1.03069425, + "balance_loss_mlp": 1.00425208, + "epoch": 0.47712379005591293, + "flos": 25156251025920.0, + "grad_norm": 2.215709228677907, + "language_loss": 0.678496, + "learning_rate": 2.244760950799322e-06, + "loss": 0.70069474, + "num_input_tokens_seen": 85321835, + "step": 3968, + "time_per_iteration": 2.7741780281066895 + }, + { + "auxiliary_loss_clip": 0.01114108, + "auxiliary_loss_mlp": 0.01085596, + "balance_loss_clip": 1.02965009, + "balance_loss_mlp": 1.00505853, + "epoch": 0.47724403294655204, + "flos": 22054323294720.0, + "grad_norm": 1.9869973429350312, + "language_loss": 0.72426379, + "learning_rate": 2.2439878159347203e-06, + "loss": 0.74626088, + "num_input_tokens_seen": 85341260, + "step": 3969, + "time_per_iteration": 2.771416664123535 + }, + { + "auxiliary_loss_clip": 0.01136362, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_clip": 1.03910828, + "balance_loss_mlp": 1.00000179, + "epoch": 0.4773642758371911, + "flos": 70229387658240.0, + "grad_norm": 0.7266405257317553, + "language_loss": 0.55277383, + "learning_rate": 2.2432146440573616e-06, + "loss": 0.57493091, + "num_input_tokens_seen": 85407220, + "step": 3970, + "time_per_iteration": 3.3329405784606934 + }, + { + "auxiliary_loss_clip": 0.0111004, + "auxiliary_loss_mlp": 0.01085887, + "balance_loss_clip": 1.03109419, + "balance_loss_mlp": 1.00525427, + "epoch": 0.4774845187278302, + "flos": 23548602009600.0, + "grad_norm": 1.8906047340804102, + "language_loss": 0.66327202, + "learning_rate": 2.242441435284534e-06, + "loss": 0.68523133, + "num_input_tokens_seen": 85426095, + "step": 3971, + "time_per_iteration": 2.781261920928955 + }, + { + "auxiliary_loss_clip": 0.0113263, + "auxiliary_loss_mlp": 0.01085462, + "balance_loss_clip": 1.02982509, + "balance_loss_mlp": 1.0047332, + "epoch": 0.4776047616184693, + "flos": 23075371301760.0, + "grad_norm": 2.4284688087757913, + "language_loss": 0.85356402, + "learning_rate": 2.2416681897335337e-06, + "loss": 0.87574494, + "num_input_tokens_seen": 85444245, + "step": 3972, + "time_per_iteration": 2.731938600540161 + }, + { + "auxiliary_loss_clip": 0.01104228, + "auxiliary_loss_mlp": 0.01086586, + "balance_loss_clip": 1.02781415, + "balance_loss_mlp": 1.00590491, + "epoch": 0.4777250045091084, + "flos": 31898119374720.0, + "grad_norm": 1.8054041803722296, + "language_loss": 0.66839665, + "learning_rate": 2.240894907521661e-06, + "loss": 0.69030476, + "num_input_tokens_seen": 85463325, + "step": 3973, + "time_per_iteration": 2.943988084793091 + }, + { + "auxiliary_loss_clip": 0.01126382, + "auxiliary_loss_mlp": 0.01086449, + "balance_loss_clip": 1.03120732, + "balance_loss_mlp": 1.00576806, + "epoch": 0.4778452473997475, + "flos": 24278163148800.0, + "grad_norm": 1.568354870814167, + "language_loss": 0.63559246, + "learning_rate": 2.240121588766223e-06, + "loss": 0.65772069, + "num_input_tokens_seen": 85483375, + "step": 3974, + "time_per_iteration": 2.7912063598632812 + }, + { + "auxiliary_loss_clip": 0.01123952, + "auxiliary_loss_mlp": 0.01087821, + "balance_loss_clip": 1.0294317, + "balance_loss_mlp": 1.00723553, + "epoch": 0.4779654902903866, + "flos": 31575031516800.0, + "grad_norm": 1.9575737484621658, + "language_loss": 0.71087825, + "learning_rate": 2.239348233584531e-06, + "loss": 0.73299599, + "num_input_tokens_seen": 85504230, + "step": 3975, + "time_per_iteration": 2.8269004821777344 + }, + { + "auxiliary_loss_clip": 0.01133474, + "auxiliary_loss_mlp": 0.01085772, + "balance_loss_clip": 1.02957273, + "balance_loss_mlp": 1.00518632, + "epoch": 0.47808573318102565, + "flos": 19500428344320.0, + "grad_norm": 6.0307575234267015, + "language_loss": 0.80751419, + "learning_rate": 2.2385748420939013e-06, + "loss": 0.82970661, + "num_input_tokens_seen": 85523425, + "step": 3976, + "time_per_iteration": 2.710458517074585 + }, + { + "auxiliary_loss_clip": 0.01144393, + "auxiliary_loss_mlp": 0.01085644, + "balance_loss_clip": 1.03251672, + "balance_loss_mlp": 1.00510645, + "epoch": 0.47820597607166476, + "flos": 22601135013120.0, + "grad_norm": 1.7541754182991727, + "language_loss": 0.71929753, + "learning_rate": 2.2378014144116583e-06, + "loss": 0.74159789, + "num_input_tokens_seen": 85542235, + "step": 3977, + "time_per_iteration": 2.7015864849090576 + }, + { + "auxiliary_loss_clip": 0.01144891, + "auxiliary_loss_mlp": 0.01085603, + "balance_loss_clip": 1.031901, + "balance_loss_mlp": 1.00492167, + "epoch": 0.4783262189623039, + "flos": 23003011353600.0, + "grad_norm": 1.7380134755356786, + "language_loss": 0.79620731, + "learning_rate": 2.23702795065513e-06, + "loss": 0.8185122, + "num_input_tokens_seen": 85561815, + "step": 3978, + "time_per_iteration": 3.525583028793335 + }, + { + "auxiliary_loss_clip": 0.01127203, + "auxiliary_loss_mlp": 0.01079901, + "balance_loss_clip": 1.03815103, + "balance_loss_mlp": 1.00055528, + "epoch": 0.47844646185294293, + "flos": 49772801226240.0, + "grad_norm": 0.9843321516522903, + "language_loss": 0.67484164, + "learning_rate": 2.2362544509416493e-06, + "loss": 0.69691265, + "num_input_tokens_seen": 85613930, + "step": 3979, + "time_per_iteration": 3.112248182296753 + }, + { + "auxiliary_loss_clip": 0.01123645, + "auxiliary_loss_mlp": 0.01086019, + "balance_loss_clip": 1.02945995, + "balance_loss_mlp": 1.00543392, + "epoch": 0.47856670474358204, + "flos": 20229558520320.0, + "grad_norm": 2.1751114358289927, + "language_loss": 0.83051515, + "learning_rate": 2.2354809153885572e-06, + "loss": 0.85261178, + "num_input_tokens_seen": 85631000, + "step": 3980, + "time_per_iteration": 2.761936902999878 + }, + { + "auxiliary_loss_clip": 0.01132107, + "auxiliary_loss_mlp": 0.01086752, + "balance_loss_clip": 1.02913213, + "balance_loss_mlp": 1.00607109, + "epoch": 0.47868694763422115, + "flos": 20990936131200.0, + "grad_norm": 2.462236674837698, + "language_loss": 0.83173525, + "learning_rate": 2.234707344113197e-06, + "loss": 0.85392392, + "num_input_tokens_seen": 85649095, + "step": 3981, + "time_per_iteration": 3.6200740337371826 + }, + { + "auxiliary_loss_clip": 0.01143524, + "auxiliary_loss_mlp": 0.01085328, + "balance_loss_clip": 1.03132558, + "balance_loss_mlp": 1.00488544, + "epoch": 0.4788071905248602, + "flos": 19026551191680.0, + "grad_norm": 1.7408441178646448, + "language_loss": 0.77605975, + "learning_rate": 2.233933737232919e-06, + "loss": 0.79834831, + "num_input_tokens_seen": 85666875, + "step": 3982, + "time_per_iteration": 2.6987340450286865 + }, + { + "auxiliary_loss_clip": 0.010976, + "auxiliary_loss_mlp": 0.00873184, + "balance_loss_clip": 1.02516294, + "balance_loss_mlp": 1.00027394, + "epoch": 0.4789274334154993, + "flos": 23002221254400.0, + "grad_norm": 1.71953855862242, + "language_loss": 0.78123021, + "learning_rate": 2.2331600948650793e-06, + "loss": 0.80093807, + "num_input_tokens_seen": 85687020, + "step": 3983, + "time_per_iteration": 2.7924907207489014 + }, + { + "auxiliary_loss_clip": 0.01110426, + "auxiliary_loss_mlp": 0.00873326, + "balance_loss_clip": 1.02624035, + "balance_loss_mlp": 1.00017643, + "epoch": 0.4790476763061384, + "flos": 23075586783360.0, + "grad_norm": 4.096919781642251, + "language_loss": 0.80247581, + "learning_rate": 2.2323864171270386e-06, + "loss": 0.82231331, + "num_input_tokens_seen": 85708290, + "step": 3984, + "time_per_iteration": 2.846698045730591 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01085795, + "balance_loss_clip": 1.02771795, + "balance_loss_mlp": 1.00506687, + "epoch": 0.4791679191967775, + "flos": 21179288073600.0, + "grad_norm": 1.8185623438811163, + "language_loss": 0.72762936, + "learning_rate": 2.231612704136164e-06, + "loss": 0.74963355, + "num_input_tokens_seen": 85728660, + "step": 3985, + "time_per_iteration": 3.858332633972168 + }, + { + "auxiliary_loss_clip": 0.01134776, + "auxiliary_loss_mlp": 0.01085009, + "balance_loss_clip": 1.03076649, + "balance_loss_mlp": 1.00432861, + "epoch": 0.4792881620874166, + "flos": 22301495758080.0, + "grad_norm": 2.233919359269077, + "language_loss": 0.75037313, + "learning_rate": 2.2308389560098253e-06, + "loss": 0.77257097, + "num_input_tokens_seen": 85745035, + "step": 3986, + "time_per_iteration": 2.655082941055298 + }, + { + "auxiliary_loss_clip": 0.01096104, + "auxiliary_loss_mlp": 0.01086152, + "balance_loss_clip": 1.02741838, + "balance_loss_mlp": 1.00547123, + "epoch": 0.47940840497805565, + "flos": 17420877423360.0, + "grad_norm": 2.3374583495971897, + "language_loss": 0.76826429, + "learning_rate": 2.2300651728654008e-06, + "loss": 0.79008687, + "num_input_tokens_seen": 85760295, + "step": 3987, + "time_per_iteration": 3.6980738639831543 + }, + { + "auxiliary_loss_clip": 0.01125454, + "auxiliary_loss_mlp": 0.00873553, + "balance_loss_clip": 1.03712368, + "balance_loss_mlp": 1.0025363, + "epoch": 0.47952864786869476, + "flos": 65358175708800.0, + "grad_norm": 0.735891099605344, + "language_loss": 0.60207629, + "learning_rate": 2.229291354820272e-06, + "loss": 0.62206638, + "num_input_tokens_seen": 85821305, + "step": 3988, + "time_per_iteration": 3.276139736175537 + }, + { + "auxiliary_loss_clip": 0.01136204, + "auxiliary_loss_mlp": 0.01086801, + "balance_loss_clip": 1.03127098, + "balance_loss_mlp": 1.0061202, + "epoch": 0.47964889075933387, + "flos": 16799802336000.0, + "grad_norm": 1.9574938255501337, + "language_loss": 0.7614553, + "learning_rate": 2.228517501991828e-06, + "loss": 0.78368533, + "num_input_tokens_seen": 85840105, + "step": 3989, + "time_per_iteration": 2.697481870651245 + }, + { + "auxiliary_loss_clip": 0.0111683, + "auxiliary_loss_mlp": 0.01079396, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.00005078, + "epoch": 0.4797691336499729, + "flos": 70079244808320.0, + "grad_norm": 0.8530792377016432, + "language_loss": 0.61056173, + "learning_rate": 2.22774361449746e-06, + "loss": 0.63252401, + "num_input_tokens_seen": 85896585, + "step": 3990, + "time_per_iteration": 3.3482565879821777 + }, + { + "auxiliary_loss_clip": 0.01087714, + "auxiliary_loss_mlp": 0.0108579, + "balance_loss_clip": 1.02223718, + "balance_loss_mlp": 1.00506115, + "epoch": 0.47988937654061203, + "flos": 18953329317120.0, + "grad_norm": 2.432082378430407, + "language_loss": 0.70252728, + "learning_rate": 2.2269696924545668e-06, + "loss": 0.7242623, + "num_input_tokens_seen": 85914415, + "step": 3991, + "time_per_iteration": 2.7967002391815186 + }, + { + "auxiliary_loss_clip": 0.01113397, + "auxiliary_loss_mlp": 0.01085425, + "balance_loss_clip": 1.02850282, + "balance_loss_mlp": 1.00503063, + "epoch": 0.48000961943125114, + "flos": 14461981649280.0, + "grad_norm": 2.180806210525636, + "language_loss": 0.77813935, + "learning_rate": 2.2261957359805523e-06, + "loss": 0.80012757, + "num_input_tokens_seen": 85931650, + "step": 3992, + "time_per_iteration": 2.836463212966919 + }, + { + "auxiliary_loss_clip": 0.01143571, + "auxiliary_loss_mlp": 0.01085593, + "balance_loss_clip": 1.03099668, + "balance_loss_mlp": 1.00505543, + "epoch": 0.4801298623218902, + "flos": 27051149105280.0, + "grad_norm": 1.8983366516880522, + "language_loss": 0.73842621, + "learning_rate": 2.225421745192823e-06, + "loss": 0.76071781, + "num_input_tokens_seen": 85951805, + "step": 3993, + "time_per_iteration": 2.649508237838745 + }, + { + "auxiliary_loss_clip": 0.01133226, + "auxiliary_loss_mlp": 0.01085571, + "balance_loss_clip": 1.0302701, + "balance_loss_mlp": 1.00493801, + "epoch": 0.4802501052125293, + "flos": 26355236031360.0, + "grad_norm": 2.318541767926605, + "language_loss": 0.78439158, + "learning_rate": 2.2246477202087955e-06, + "loss": 0.80657959, + "num_input_tokens_seen": 85972485, + "step": 3994, + "time_per_iteration": 2.745331048965454 + }, + { + "auxiliary_loss_clip": 0.01126014, + "auxiliary_loss_mlp": 0.01086511, + "balance_loss_clip": 1.02999735, + "balance_loss_mlp": 1.00597358, + "epoch": 0.4803703481031684, + "flos": 20993916960000.0, + "grad_norm": 1.7254514019252396, + "language_loss": 0.82825005, + "learning_rate": 2.223873661145887e-06, + "loss": 0.85037529, + "num_input_tokens_seen": 85992540, + "step": 3995, + "time_per_iteration": 2.8009121417999268 + }, + { + "auxiliary_loss_clip": 0.01119791, + "auxiliary_loss_mlp": 0.00873148, + "balance_loss_clip": 1.02695966, + "balance_loss_mlp": 1.00022376, + "epoch": 0.4804905909938075, + "flos": 20703722981760.0, + "grad_norm": 1.4697107328587857, + "language_loss": 0.71487916, + "learning_rate": 2.2230995681215226e-06, + "loss": 0.73480844, + "num_input_tokens_seen": 86012065, + "step": 3996, + "time_per_iteration": 2.748821258544922 + }, + { + "auxiliary_loss_clip": 0.01112502, + "auxiliary_loss_mlp": 0.01085548, + "balance_loss_clip": 1.02696466, + "balance_loss_mlp": 1.00501037, + "epoch": 0.4806108338844466, + "flos": 16654831044480.0, + "grad_norm": 2.789155941764809, + "language_loss": 0.7820062, + "learning_rate": 2.2223254412531305e-06, + "loss": 0.80398679, + "num_input_tokens_seen": 86029435, + "step": 3997, + "time_per_iteration": 2.797961711883545 + }, + { + "auxiliary_loss_clip": 0.01123245, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_clip": 1.02877021, + "balance_loss_mlp": 1.00425518, + "epoch": 0.4807310767750857, + "flos": 20011329440640.0, + "grad_norm": 2.000207393254033, + "language_loss": 0.82306612, + "learning_rate": 2.221551280658146e-06, + "loss": 0.84514648, + "num_input_tokens_seen": 86048495, + "step": 3998, + "time_per_iteration": 2.719731569290161 + }, + { + "auxiliary_loss_clip": 0.01106406, + "auxiliary_loss_mlp": 0.01085896, + "balance_loss_clip": 1.02856302, + "balance_loss_mlp": 1.00540566, + "epoch": 0.48085131966572475, + "flos": 23185257984000.0, + "grad_norm": 1.6511207320843007, + "language_loss": 0.74099833, + "learning_rate": 2.2207770864540085e-06, + "loss": 0.76292133, + "num_input_tokens_seen": 86067470, + "step": 3999, + "time_per_iteration": 2.850783348083496 + }, + { + "auxiliary_loss_clip": 0.01126611, + "auxiliary_loss_mlp": 0.01085963, + "balance_loss_clip": 1.03052497, + "balance_loss_mlp": 1.00547314, + "epoch": 0.48097156255636386, + "flos": 20558643949440.0, + "grad_norm": 2.9173921921099306, + "language_loss": 0.72795951, + "learning_rate": 2.220002858758162e-06, + "loss": 0.75008523, + "num_input_tokens_seen": 86085460, + "step": 4000, + "time_per_iteration": 2.748840808868408 + }, + { + "auxiliary_loss_clip": 0.01124915, + "auxiliary_loss_mlp": 0.01079293, + "balance_loss_clip": 1.03577924, + "balance_loss_mlp": 0.99994767, + "epoch": 0.481091805447003, + "flos": 70511608817280.0, + "grad_norm": 0.8747804003947297, + "language_loss": 0.60859537, + "learning_rate": 2.2192285976880573e-06, + "loss": 0.63063753, + "num_input_tokens_seen": 86149715, + "step": 4001, + "time_per_iteration": 3.278440475463867 + }, + { + "auxiliary_loss_clip": 0.01114454, + "auxiliary_loss_mlp": 0.00872923, + "balance_loss_clip": 1.02815366, + "balance_loss_mlp": 1.00023329, + "epoch": 0.48121204833764203, + "flos": 36428214839040.0, + "grad_norm": 1.5530379920714747, + "language_loss": 0.8027221, + "learning_rate": 2.2184543033611485e-06, + "loss": 0.82259589, + "num_input_tokens_seen": 86170795, + "step": 4002, + "time_per_iteration": 2.9323506355285645 + }, + { + "auxiliary_loss_clip": 0.01133582, + "auxiliary_loss_mlp": 0.01085052, + "balance_loss_clip": 1.02935195, + "balance_loss_mlp": 1.0044663, + "epoch": 0.48133229122828114, + "flos": 27490264871040.0, + "grad_norm": 2.061923023685186, + "language_loss": 0.8185541, + "learning_rate": 2.2176799758948957e-06, + "loss": 0.84074044, + "num_input_tokens_seen": 86190955, + "step": 4003, + "time_per_iteration": 3.650573968887329 + }, + { + "auxiliary_loss_clip": 0.0112501, + "auxiliary_loss_mlp": 0.01084943, + "balance_loss_clip": 1.02943742, + "balance_loss_mlp": 1.00430989, + "epoch": 0.4814525341189202, + "flos": 43072802179200.0, + "grad_norm": 2.7215330878476847, + "language_loss": 0.73232967, + "learning_rate": 2.2169056154067635e-06, + "loss": 0.75442916, + "num_input_tokens_seen": 86214875, + "step": 4004, + "time_per_iteration": 2.9319913387298584 + }, + { + "auxiliary_loss_clip": 0.01132281, + "auxiliary_loss_mlp": 0.00873202, + "balance_loss_clip": 1.02943766, + "balance_loss_mlp": 1.00018692, + "epoch": 0.4815727770095593, + "flos": 24236901400320.0, + "grad_norm": 1.8249028292873262, + "language_loss": 0.82498705, + "learning_rate": 2.216131222014222e-06, + "loss": 0.84504187, + "num_input_tokens_seen": 86232950, + "step": 4005, + "time_per_iteration": 2.7205867767333984 + }, + { + "auxiliary_loss_clip": 0.01116442, + "auxiliary_loss_mlp": 0.0108669, + "balance_loss_clip": 1.02953935, + "balance_loss_mlp": 1.00586569, + "epoch": 0.4816930199001984, + "flos": 18113630100480.0, + "grad_norm": 3.3649891328573918, + "language_loss": 0.80279392, + "learning_rate": 2.2153567958347455e-06, + "loss": 0.82482523, + "num_input_tokens_seen": 86249160, + "step": 4006, + "time_per_iteration": 2.7483162879943848 + }, + { + "auxiliary_loss_clip": 0.01122468, + "auxiliary_loss_mlp": 0.01084638, + "balance_loss_clip": 1.02882886, + "balance_loss_mlp": 1.00414789, + "epoch": 0.48181326279083747, + "flos": 17274720983040.0, + "grad_norm": 2.4140780350976088, + "language_loss": 0.79715526, + "learning_rate": 2.214582336985815e-06, + "loss": 0.81922632, + "num_input_tokens_seen": 86267060, + "step": 4007, + "time_per_iteration": 3.6963400840759277 + }, + { + "auxiliary_loss_clip": 0.0111655, + "auxiliary_loss_mlp": 0.01085566, + "balance_loss_clip": 1.02699852, + "balance_loss_mlp": 1.0048852, + "epoch": 0.4819335056814766, + "flos": 14903252231040.0, + "grad_norm": 2.085262442210563, + "language_loss": 0.66148126, + "learning_rate": 2.2138078455849142e-06, + "loss": 0.68350244, + "num_input_tokens_seen": 86285055, + "step": 4008, + "time_per_iteration": 2.6870102882385254 + }, + { + "auxiliary_loss_clip": 0.01119852, + "auxiliary_loss_mlp": 0.01084078, + "balance_loss_clip": 1.03129733, + "balance_loss_mlp": 1.00354075, + "epoch": 0.4820537485721157, + "flos": 19244888012160.0, + "grad_norm": 1.9763979209599158, + "language_loss": 0.78731048, + "learning_rate": 2.2130333217495334e-06, + "loss": 0.80934978, + "num_input_tokens_seen": 86304225, + "step": 4009, + "time_per_iteration": 2.7093749046325684 + }, + { + "auxiliary_loss_clip": 0.01114212, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_clip": 1.02613306, + "balance_loss_mlp": 1.00515723, + "epoch": 0.48217399146275475, + "flos": 16033791870720.0, + "grad_norm": 3.090877532204909, + "language_loss": 0.68133169, + "learning_rate": 2.2122587655971665e-06, + "loss": 0.70333362, + "num_input_tokens_seen": 86319170, + "step": 4010, + "time_per_iteration": 3.733642578125 + }, + { + "auxiliary_loss_clip": 0.01124382, + "auxiliary_loss_mlp": 0.01085338, + "balance_loss_clip": 1.02942705, + "balance_loss_mlp": 1.00475216, + "epoch": 0.48229423435339386, + "flos": 24134197438080.0, + "grad_norm": 1.7832695527113536, + "language_loss": 0.63888013, + "learning_rate": 2.211484177245314e-06, + "loss": 0.66097736, + "num_input_tokens_seen": 86338760, + "step": 4011, + "time_per_iteration": 2.723989963531494 + }, + { + "auxiliary_loss_clip": 0.01144308, + "auxiliary_loss_mlp": 0.01084572, + "balance_loss_clip": 1.03159785, + "balance_loss_mlp": 1.00389123, + "epoch": 0.48241447724403297, + "flos": 23805435231360.0, + "grad_norm": 1.985238445715317, + "language_loss": 0.72093093, + "learning_rate": 2.21070955681148e-06, + "loss": 0.74321973, + "num_input_tokens_seen": 86357865, + "step": 4012, + "time_per_iteration": 2.7024905681610107 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.0108544, + "balance_loss_clip": 1.02805424, + "balance_loss_mlp": 1.0049026, + "epoch": 0.482534720134672, + "flos": 23110312256640.0, + "grad_norm": 1.4955124296230093, + "language_loss": 0.7817902, + "learning_rate": 2.209934904413174e-06, + "loss": 0.80378228, + "num_input_tokens_seen": 86379470, + "step": 4013, + "time_per_iteration": 3.7802445888519287 + }, + { + "auxiliary_loss_clip": 0.01090675, + "auxiliary_loss_mlp": 0.0108548, + "balance_loss_clip": 1.02509212, + "balance_loss_mlp": 1.00479913, + "epoch": 0.48265496302531113, + "flos": 20923819568640.0, + "grad_norm": 4.479313730819171, + "language_loss": 0.71527964, + "learning_rate": 2.2091602201679095e-06, + "loss": 0.73704123, + "num_input_tokens_seen": 86399080, + "step": 4014, + "time_per_iteration": 2.9431371688842773 + }, + { + "auxiliary_loss_clip": 0.01115962, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_clip": 1.02910376, + "balance_loss_mlp": 1.0045234, + "epoch": 0.48277520591595025, + "flos": 15231152511360.0, + "grad_norm": 2.0940488723804975, + "language_loss": 0.83322358, + "learning_rate": 2.208385504193206e-06, + "loss": 0.85523331, + "num_input_tokens_seen": 86416580, + "step": 4015, + "time_per_iteration": 2.7523887157440186 + }, + { + "auxiliary_loss_clip": 0.01143317, + "auxiliary_loss_mlp": 0.01084301, + "balance_loss_clip": 1.03091824, + "balance_loss_mlp": 1.00352514, + "epoch": 0.4828954488065893, + "flos": 17858664385920.0, + "grad_norm": 1.9665630510769614, + "language_loss": 0.8121351, + "learning_rate": 2.2076107566065873e-06, + "loss": 0.83441126, + "num_input_tokens_seen": 86434365, + "step": 4016, + "time_per_iteration": 2.562269449234009 + }, + { + "auxiliary_loss_clip": 0.01119391, + "auxiliary_loss_mlp": 0.0108431, + "balance_loss_clip": 1.03145885, + "balance_loss_mlp": 1.00372493, + "epoch": 0.4830156916972284, + "flos": 32087405070720.0, + "grad_norm": 2.3102630692074446, + "language_loss": 0.75780839, + "learning_rate": 2.2068359775255816e-06, + "loss": 0.77984542, + "num_input_tokens_seen": 86452675, + "step": 4017, + "time_per_iteration": 2.767307996749878 + }, + { + "auxiliary_loss_clip": 0.01098171, + "auxiliary_loss_mlp": 0.01083958, + "balance_loss_clip": 1.02657533, + "balance_loss_mlp": 1.00342011, + "epoch": 0.48313593458786747, + "flos": 21871717528320.0, + "grad_norm": 2.9146754916912343, + "language_loss": 0.7870568, + "learning_rate": 2.206061167067723e-06, + "loss": 0.80887806, + "num_input_tokens_seen": 86470785, + "step": 4018, + "time_per_iteration": 2.8885746002197266 + }, + { + "auxiliary_loss_clip": 0.01115801, + "auxiliary_loss_mlp": 0.01085737, + "balance_loss_clip": 1.02850485, + "balance_loss_mlp": 1.00510406, + "epoch": 0.4832561774785066, + "flos": 22601206840320.0, + "grad_norm": 3.4157806899435155, + "language_loss": 0.79516506, + "learning_rate": 2.205286325350549e-06, + "loss": 0.8171804, + "num_input_tokens_seen": 86489850, + "step": 4019, + "time_per_iteration": 2.804044723510742 + }, + { + "auxiliary_loss_clip": 0.01103411, + "auxiliary_loss_mlp": 0.01085845, + "balance_loss_clip": 1.02635419, + "balance_loss_mlp": 1.00525928, + "epoch": 0.4833764203691457, + "flos": 13437342282240.0, + "grad_norm": 1.9821682889358858, + "language_loss": 0.72568285, + "learning_rate": 2.204511452491603e-06, + "loss": 0.7475754, + "num_input_tokens_seen": 86506475, + "step": 4020, + "time_per_iteration": 2.754302501678467 + }, + { + "auxiliary_loss_clip": 0.0114407, + "auxiliary_loss_mlp": 0.01085308, + "balance_loss_clip": 1.03217721, + "balance_loss_mlp": 1.0048182, + "epoch": 0.48349666325978474, + "flos": 44128036955520.0, + "grad_norm": 1.6179289634506808, + "language_loss": 0.74739194, + "learning_rate": 2.2037365486084316e-06, + "loss": 0.76968575, + "num_input_tokens_seen": 86529715, + "step": 4021, + "time_per_iteration": 2.8605921268463135 + }, + { + "auxiliary_loss_clip": 0.01101709, + "auxiliary_loss_mlp": 0.01085497, + "balance_loss_clip": 1.02909827, + "balance_loss_mlp": 1.0048157, + "epoch": 0.48361690615042385, + "flos": 26028377245440.0, + "grad_norm": 2.035608011183867, + "language_loss": 0.78192157, + "learning_rate": 2.2029616138185886e-06, + "loss": 0.80379367, + "num_input_tokens_seen": 86548715, + "step": 4022, + "time_per_iteration": 2.757645606994629 + }, + { + "auxiliary_loss_clip": 0.0111208, + "auxiliary_loss_mlp": 0.01085307, + "balance_loss_clip": 1.0278573, + "balance_loss_mlp": 1.00481701, + "epoch": 0.48373714904106296, + "flos": 22273306560000.0, + "grad_norm": 1.6158924292112269, + "language_loss": 0.82510287, + "learning_rate": 2.202186648239629e-06, + "loss": 0.84707677, + "num_input_tokens_seen": 86568650, + "step": 4023, + "time_per_iteration": 2.7801008224487305 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.01084645, + "balance_loss_clip": 1.02829301, + "balance_loss_mlp": 1.0041554, + "epoch": 0.483857391931702, + "flos": 28292293699200.0, + "grad_norm": 1.5893843725001917, + "language_loss": 0.71509981, + "learning_rate": 2.201411651989117e-06, + "loss": 0.7372486, + "num_input_tokens_seen": 86590630, + "step": 4024, + "time_per_iteration": 2.781148910522461 + }, + { + "auxiliary_loss_clip": 0.0112106, + "auxiliary_loss_mlp": 0.00873136, + "balance_loss_clip": 1.02901983, + "balance_loss_mlp": 1.0001967, + "epoch": 0.48397763482234113, + "flos": 27418048577280.0, + "grad_norm": 1.7617915997927418, + "language_loss": 0.77850497, + "learning_rate": 2.2006366251846167e-06, + "loss": 0.79844695, + "num_input_tokens_seen": 86611270, + "step": 4025, + "time_per_iteration": 2.837989568710327 + }, + { + "auxiliary_loss_clip": 0.01124704, + "auxiliary_loss_mlp": 0.01084779, + "balance_loss_clip": 1.02940607, + "balance_loss_mlp": 1.00438452, + "epoch": 0.48409787771298024, + "flos": 16797252470400.0, + "grad_norm": 2.3259991469745103, + "language_loss": 0.75162435, + "learning_rate": 2.1998615679436997e-06, + "loss": 0.77371913, + "num_input_tokens_seen": 86628810, + "step": 4026, + "time_per_iteration": 2.763787031173706 + }, + { + "auxiliary_loss_clip": 0.01125959, + "auxiliary_loss_mlp": 0.01086219, + "balance_loss_clip": 1.02959466, + "balance_loss_mlp": 1.0054909, + "epoch": 0.4842181206036193, + "flos": 25083496028160.0, + "grad_norm": 2.2654914605085, + "language_loss": 0.77728832, + "learning_rate": 2.199086480383942e-06, + "loss": 0.7994101, + "num_input_tokens_seen": 86648185, + "step": 4027, + "time_per_iteration": 2.7573611736297607 + }, + { + "auxiliary_loss_clip": 0.01127008, + "auxiliary_loss_mlp": 0.01088384, + "balance_loss_clip": 1.03047991, + "balance_loss_mlp": 1.00756037, + "epoch": 0.4843383634942584, + "flos": 30372311496960.0, + "grad_norm": 2.585593451041886, + "language_loss": 0.67698967, + "learning_rate": 2.1983113626229234e-06, + "loss": 0.69914353, + "num_input_tokens_seen": 86667435, + "step": 4028, + "time_per_iteration": 2.8129653930664062 + }, + { + "auxiliary_loss_clip": 0.01117426, + "auxiliary_loss_mlp": 0.00873116, + "balance_loss_clip": 1.02922249, + "balance_loss_mlp": 1.00019681, + "epoch": 0.4844586063848975, + "flos": 20413564917120.0, + "grad_norm": 1.654057150818865, + "language_loss": 0.78431541, + "learning_rate": 2.1975362147782293e-06, + "loss": 0.8042208, + "num_input_tokens_seen": 86686630, + "step": 4029, + "time_per_iteration": 3.7534303665161133 + }, + { + "auxiliary_loss_clip": 0.01104905, + "auxiliary_loss_mlp": 0.01080336, + "balance_loss_clip": 1.02470648, + "balance_loss_mlp": 1.00098991, + "epoch": 0.48457884927553657, + "flos": 70303722854400.0, + "grad_norm": 0.6925480131659555, + "language_loss": 0.54124522, + "learning_rate": 2.196761036967448e-06, + "loss": 0.5630976, + "num_input_tokens_seen": 86754595, + "step": 4030, + "time_per_iteration": 3.40236496925354 + }, + { + "auxiliary_loss_clip": 0.01131814, + "auxiliary_loss_mlp": 0.01085682, + "balance_loss_clip": 1.02920651, + "balance_loss_mlp": 1.00514448, + "epoch": 0.4846990921661757, + "flos": 19934516206080.0, + "grad_norm": 1.6215322333614848, + "language_loss": 0.7752738, + "learning_rate": 2.1959858293081743e-06, + "loss": 0.79744875, + "num_input_tokens_seen": 86773730, + "step": 4031, + "time_per_iteration": 2.703139543533325 + }, + { + "auxiliary_loss_clip": 0.01113044, + "auxiliary_loss_mlp": 0.01085893, + "balance_loss_clip": 1.02748287, + "balance_loss_mlp": 1.00540257, + "epoch": 0.4848193350568148, + "flos": 23075945919360.0, + "grad_norm": 5.226651288887495, + "language_loss": 0.75932163, + "learning_rate": 2.1952105919180056e-06, + "loss": 0.78131104, + "num_input_tokens_seen": 86792985, + "step": 4032, + "time_per_iteration": 2.7757797241210938 + }, + { + "auxiliary_loss_clip": 0.01122065, + "auxiliary_loss_mlp": 0.01084155, + "balance_loss_clip": 1.02796805, + "balance_loss_mlp": 1.00361729, + "epoch": 0.48493957794745385, + "flos": 22455481363200.0, + "grad_norm": 2.363802937936541, + "language_loss": 0.6814723, + "learning_rate": 2.1944353249145456e-06, + "loss": 0.70353448, + "num_input_tokens_seen": 86812095, + "step": 4033, + "time_per_iteration": 3.6585493087768555 + }, + { + "auxiliary_loss_clip": 0.01143953, + "auxiliary_loss_mlp": 0.01084773, + "balance_loss_clip": 1.03131557, + "balance_loss_mlp": 1.00433111, + "epoch": 0.48505982083809296, + "flos": 25046112948480.0, + "grad_norm": 1.6943743596390866, + "language_loss": 0.7482332, + "learning_rate": 2.193660028415401e-06, + "loss": 0.77052045, + "num_input_tokens_seen": 86832875, + "step": 4034, + "time_per_iteration": 2.7014994621276855 + }, + { + "auxiliary_loss_clip": 0.01124199, + "auxiliary_loss_mlp": 0.01084678, + "balance_loss_clip": 1.02879119, + "balance_loss_mlp": 1.00423598, + "epoch": 0.485180063728732, + "flos": 26761386090240.0, + "grad_norm": 2.021567969547527, + "language_loss": 0.82111526, + "learning_rate": 2.1928847025381852e-06, + "loss": 0.84320402, + "num_input_tokens_seen": 86853480, + "step": 4035, + "time_per_iteration": 3.732304573059082 + }, + { + "auxiliary_loss_clip": 0.0113302, + "auxiliary_loss_mlp": 0.01085461, + "balance_loss_clip": 1.02868831, + "balance_loss_mlp": 1.0048275, + "epoch": 0.4853003066193711, + "flos": 24059143969920.0, + "grad_norm": 1.5785993123117958, + "language_loss": 0.84001195, + "learning_rate": 2.192109347400512e-06, + "loss": 0.86219674, + "num_input_tokens_seen": 86873695, + "step": 4036, + "time_per_iteration": 2.710218906402588 + }, + { + "auxiliary_loss_clip": 0.01123971, + "auxiliary_loss_mlp": 0.01085557, + "balance_loss_clip": 1.02880132, + "balance_loss_mlp": 1.00482821, + "epoch": 0.48542054951001024, + "flos": 23076376882560.0, + "grad_norm": 2.2900175155677407, + "language_loss": 0.78973997, + "learning_rate": 2.191333963120004e-06, + "loss": 0.81183517, + "num_input_tokens_seen": 86892675, + "step": 4037, + "time_per_iteration": 2.7547099590301514 + }, + { + "auxiliary_loss_clip": 0.01119787, + "auxiliary_loss_mlp": 0.01085186, + "balance_loss_clip": 1.02618992, + "balance_loss_mlp": 1.00455248, + "epoch": 0.4855407924006493, + "flos": 25664889565440.0, + "grad_norm": 2.167328626879115, + "language_loss": 0.69995499, + "learning_rate": 2.190558549814286e-06, + "loss": 0.72200471, + "num_input_tokens_seen": 86912835, + "step": 4038, + "time_per_iteration": 3.650829315185547 + }, + { + "auxiliary_loss_clip": 0.01127111, + "auxiliary_loss_mlp": 0.0108599, + "balance_loss_clip": 1.03054404, + "balance_loss_mlp": 1.0054996, + "epoch": 0.4856610352912884, + "flos": 23987933256960.0, + "grad_norm": 1.9392258889582652, + "language_loss": 0.79702938, + "learning_rate": 2.1897831076009872e-06, + "loss": 0.81916034, + "num_input_tokens_seen": 86932475, + "step": 4039, + "time_per_iteration": 2.8016326427459717 + }, + { + "auxiliary_loss_clip": 0.01131993, + "auxiliary_loss_mlp": 0.01085802, + "balance_loss_clip": 1.02918983, + "balance_loss_mlp": 1.00531149, + "epoch": 0.4857812781819275, + "flos": 24096814358400.0, + "grad_norm": 1.6731430574879746, + "language_loss": 0.79559129, + "learning_rate": 2.1890076365977426e-06, + "loss": 0.81776923, + "num_input_tokens_seen": 86952300, + "step": 4040, + "time_per_iteration": 2.729240655899048 + }, + { + "auxiliary_loss_clip": 0.01111746, + "auxiliary_loss_mlp": 0.01079784, + "balance_loss_clip": 1.03156996, + "balance_loss_mlp": 1.00043797, + "epoch": 0.48590152107256657, + "flos": 56266635185280.0, + "grad_norm": 0.8555217561541364, + "language_loss": 0.52865469, + "learning_rate": 2.188232136922189e-06, + "loss": 0.55057001, + "num_input_tokens_seen": 87010420, + "step": 4041, + "time_per_iteration": 3.262538194656372 + }, + { + "auxiliary_loss_clip": 0.01079022, + "auxiliary_loss_mlp": 0.01083842, + "balance_loss_clip": 1.02633989, + "balance_loss_mlp": 1.00325608, + "epoch": 0.4860217639632057, + "flos": 20046988667520.0, + "grad_norm": 2.0375572170466834, + "language_loss": 0.75591302, + "learning_rate": 2.187456608691971e-06, + "loss": 0.7775417, + "num_input_tokens_seen": 87029295, + "step": 4042, + "time_per_iteration": 3.0269250869750977 + }, + { + "auxiliary_loss_clip": 0.01115364, + "auxiliary_loss_mlp": 0.01085507, + "balance_loss_clip": 1.02872229, + "balance_loss_mlp": 1.00496936, + "epoch": 0.4861420068538448, + "flos": 17822143232640.0, + "grad_norm": 1.72983275399832, + "language_loss": 0.87762547, + "learning_rate": 2.1866810520247334e-06, + "loss": 0.89963412, + "num_input_tokens_seen": 87048165, + "step": 4043, + "time_per_iteration": 2.7845776081085205 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01085675, + "balance_loss_clip": 1.0296216, + "balance_loss_mlp": 1.00499392, + "epoch": 0.48626224974448384, + "flos": 26250125857920.0, + "grad_norm": 1.7362556076617635, + "language_loss": 0.64864302, + "learning_rate": 2.185905467038129e-06, + "loss": 0.67083961, + "num_input_tokens_seen": 87067070, + "step": 4044, + "time_per_iteration": 2.7816731929779053 + }, + { + "auxiliary_loss_clip": 0.01143417, + "auxiliary_loss_mlp": 0.01086184, + "balance_loss_clip": 1.03174412, + "balance_loss_mlp": 1.00559902, + "epoch": 0.48638249263512295, + "flos": 22054502862720.0, + "grad_norm": 2.006869690857314, + "language_loss": 0.77295309, + "learning_rate": 2.1851298538498127e-06, + "loss": 0.7952491, + "num_input_tokens_seen": 87086785, + "step": 4045, + "time_per_iteration": 2.6815683841705322 + }, + { + "auxiliary_loss_clip": 0.0113386, + "auxiliary_loss_mlp": 0.00873147, + "balance_loss_clip": 1.03011227, + "balance_loss_mlp": 1.00015235, + "epoch": 0.48650273552576206, + "flos": 25119945354240.0, + "grad_norm": 1.7994049893571065, + "language_loss": 0.80130464, + "learning_rate": 2.184354212577446e-06, + "loss": 0.82137471, + "num_input_tokens_seen": 87107090, + "step": 4046, + "time_per_iteration": 2.7428178787231445 + }, + { + "auxiliary_loss_clip": 0.01142478, + "auxiliary_loss_mlp": 0.01084702, + "balance_loss_clip": 1.02958536, + "balance_loss_mlp": 1.00397372, + "epoch": 0.4866229784164011, + "flos": 17456931699840.0, + "grad_norm": 2.4573034817404142, + "language_loss": 0.62623632, + "learning_rate": 2.1835785433386907e-06, + "loss": 0.64850813, + "num_input_tokens_seen": 87125905, + "step": 4047, + "time_per_iteration": 2.6048126220703125 + }, + { + "auxiliary_loss_clip": 0.01112658, + "auxiliary_loss_mlp": 0.01084675, + "balance_loss_clip": 1.02740645, + "balance_loss_mlp": 1.00418496, + "epoch": 0.48674322130704023, + "flos": 23331127115520.0, + "grad_norm": 1.7051893877381215, + "language_loss": 0.65459448, + "learning_rate": 2.182802846251216e-06, + "loss": 0.67656779, + "num_input_tokens_seen": 87146175, + "step": 4048, + "time_per_iteration": 2.819382905960083 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01085376, + "balance_loss_clip": 1.02835357, + "balance_loss_mlp": 1.00483799, + "epoch": 0.4868634641976793, + "flos": 28804344030720.0, + "grad_norm": 1.76496427739044, + "language_loss": 0.72142243, + "learning_rate": 2.182027121432696e-06, + "loss": 0.74342436, + "num_input_tokens_seen": 87166800, + "step": 4049, + "time_per_iteration": 2.770709753036499 + }, + { + "auxiliary_loss_clip": 0.01143352, + "auxiliary_loss_mlp": 0.01085773, + "balance_loss_clip": 1.03110003, + "balance_loss_mlp": 1.00514007, + "epoch": 0.4869837070883184, + "flos": 19025976574080.0, + "grad_norm": 1.7067159983468845, + "language_loss": 0.82154214, + "learning_rate": 2.1812513690008054e-06, + "loss": 0.84383345, + "num_input_tokens_seen": 87185920, + "step": 4050, + "time_per_iteration": 2.681021213531494 + }, + { + "auxiliary_loss_clip": 0.01134155, + "auxiliary_loss_mlp": 0.01085104, + "balance_loss_clip": 1.02913165, + "balance_loss_mlp": 1.00451875, + "epoch": 0.4871039499789575, + "flos": 15121409483520.0, + "grad_norm": 1.9070325588626542, + "language_loss": 0.79533505, + "learning_rate": 2.180475589073227e-06, + "loss": 0.81752771, + "num_input_tokens_seen": 87203620, + "step": 4051, + "time_per_iteration": 2.651984930038452 + }, + { + "auxiliary_loss_clip": 0.01134567, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_clip": 1.02996469, + "balance_loss_mlp": 1.00456762, + "epoch": 0.48722419286959656, + "flos": 26174066808960.0, + "grad_norm": 1.5920328555623768, + "language_loss": 0.73348677, + "learning_rate": 2.1796997817676456e-06, + "loss": 0.75568259, + "num_input_tokens_seen": 87224630, + "step": 4052, + "time_per_iteration": 2.6952478885650635 + }, + { + "auxiliary_loss_clip": 0.01134473, + "auxiliary_loss_mlp": 0.00872962, + "balance_loss_clip": 1.03062057, + "balance_loss_mlp": 1.00017917, + "epoch": 0.4873444357602357, + "flos": 24026142349440.0, + "grad_norm": 1.6179005242929523, + "language_loss": 0.67303795, + "learning_rate": 2.1789239472017494e-06, + "loss": 0.69311231, + "num_input_tokens_seen": 87246280, + "step": 4053, + "time_per_iteration": 2.7108497619628906 + }, + { + "auxiliary_loss_clip": 0.01114865, + "auxiliary_loss_mlp": 0.01085637, + "balance_loss_clip": 1.02787077, + "balance_loss_mlp": 1.00505149, + "epoch": 0.4874646786508748, + "flos": 22820441500800.0, + "grad_norm": 2.0455650913941996, + "language_loss": 0.72995752, + "learning_rate": 2.1781480854932326e-06, + "loss": 0.75196254, + "num_input_tokens_seen": 87266045, + "step": 4054, + "time_per_iteration": 2.7777047157287598 + }, + { + "auxiliary_loss_clip": 0.01102454, + "auxiliary_loss_mlp": 0.01084176, + "balance_loss_clip": 1.02589214, + "balance_loss_mlp": 1.00373411, + "epoch": 0.48758492154151384, + "flos": 21287594557440.0, + "grad_norm": 1.8560843656729278, + "language_loss": 0.79116499, + "learning_rate": 2.1773721967597933e-06, + "loss": 0.81303132, + "num_input_tokens_seen": 87284495, + "step": 4055, + "time_per_iteration": 3.765533685684204 + }, + { + "auxiliary_loss_clip": 0.0110986, + "auxiliary_loss_mlp": 0.01079574, + "balance_loss_clip": 1.03027225, + "balance_loss_mlp": 1.00022817, + "epoch": 0.48770516443215295, + "flos": 62244109180800.0, + "grad_norm": 0.8461995401988958, + "language_loss": 0.57419574, + "learning_rate": 2.1765962811191322e-06, + "loss": 0.59609008, + "num_input_tokens_seen": 87338960, + "step": 4056, + "time_per_iteration": 3.2518656253814697 + }, + { + "auxiliary_loss_clip": 0.01089084, + "auxiliary_loss_mlp": 0.01079417, + "balance_loss_clip": 1.02594829, + "balance_loss_mlp": 1.00007153, + "epoch": 0.48782540732279206, + "flos": 66133451882880.0, + "grad_norm": 0.8232839097359738, + "language_loss": 0.62050307, + "learning_rate": 2.1758203386889566e-06, + "loss": 0.64218807, + "num_input_tokens_seen": 87401730, + "step": 4057, + "time_per_iteration": 3.351497173309326 + }, + { + "auxiliary_loss_clip": 0.01115305, + "auxiliary_loss_mlp": 0.0087303, + "balance_loss_clip": 1.02836347, + "balance_loss_mlp": 1.00010395, + "epoch": 0.4879456502134311, + "flos": 14607922608000.0, + "grad_norm": 1.9163153918284754, + "language_loss": 0.84211701, + "learning_rate": 2.1750443695869746e-06, + "loss": 0.86200035, + "num_input_tokens_seen": 87417300, + "step": 4058, + "time_per_iteration": 3.600038766860962 + }, + { + "auxiliary_loss_clip": 0.01132716, + "auxiliary_loss_mlp": 0.01086084, + "balance_loss_clip": 1.02807653, + "balance_loss_mlp": 1.00554657, + "epoch": 0.4880658931040702, + "flos": 19500464257920.0, + "grad_norm": 1.7170574512343268, + "language_loss": 0.85693085, + "learning_rate": 2.174268373930901e-06, + "loss": 0.87911886, + "num_input_tokens_seen": 87434815, + "step": 4059, + "time_per_iteration": 2.665200710296631 + }, + { + "auxiliary_loss_clip": 0.01110045, + "auxiliary_loss_mlp": 0.00872944, + "balance_loss_clip": 1.02655101, + "balance_loss_mlp": 1.00016832, + "epoch": 0.48818613599470934, + "flos": 16723060928640.0, + "grad_norm": 3.406748600984699, + "language_loss": 0.79674149, + "learning_rate": 2.1734923518384537e-06, + "loss": 0.81657135, + "num_input_tokens_seen": 87451420, + "step": 4060, + "time_per_iteration": 3.6249284744262695 + }, + { + "auxiliary_loss_clip": 0.01113338, + "auxiliary_loss_mlp": 0.010856, + "balance_loss_clip": 1.02857816, + "balance_loss_mlp": 1.00525331, + "epoch": 0.4883063788853484, + "flos": 26756932803840.0, + "grad_norm": 1.7674375997427365, + "language_loss": 0.82044637, + "learning_rate": 2.1727163034273547e-06, + "loss": 0.84243572, + "num_input_tokens_seen": 87469585, + "step": 4061, + "time_per_iteration": 2.8208236694335938 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.0108412, + "balance_loss_clip": 1.03054976, + "balance_loss_mlp": 1.00353432, + "epoch": 0.4884266217759875, + "flos": 16763388923520.0, + "grad_norm": 2.261781437824032, + "language_loss": 0.78694808, + "learning_rate": 2.17194022881533e-06, + "loss": 0.80913711, + "num_input_tokens_seen": 87485675, + "step": 4062, + "time_per_iteration": 2.667792320251465 + }, + { + "auxiliary_loss_clip": 0.01123761, + "auxiliary_loss_mlp": 0.01084314, + "balance_loss_clip": 1.02916944, + "balance_loss_mlp": 1.00368106, + "epoch": 0.4885468646666266, + "flos": 24207132003840.0, + "grad_norm": 1.8090040782965686, + "language_loss": 0.67382073, + "learning_rate": 2.1711641281201092e-06, + "loss": 0.69590151, + "num_input_tokens_seen": 87505605, + "step": 4063, + "time_per_iteration": 3.596956968307495 + }, + { + "auxiliary_loss_clip": 0.01131991, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_clip": 1.02962422, + "balance_loss_mlp": 1.00477076, + "epoch": 0.48866710755726567, + "flos": 14610795696000.0, + "grad_norm": 2.7969575885282936, + "language_loss": 0.7942844, + "learning_rate": 2.1703880014594264e-06, + "loss": 0.81645745, + "num_input_tokens_seen": 87523195, + "step": 4064, + "time_per_iteration": 2.705674886703491 + }, + { + "auxiliary_loss_clip": 0.01091688, + "auxiliary_loss_mlp": 0.01086421, + "balance_loss_clip": 1.02339303, + "balance_loss_mlp": 1.00597858, + "epoch": 0.4887873504479048, + "flos": 28804451771520.0, + "grad_norm": 1.8706326914070635, + "language_loss": 0.73928034, + "learning_rate": 2.1696118489510182e-06, + "loss": 0.76106143, + "num_input_tokens_seen": 87544125, + "step": 4065, + "time_per_iteration": 2.9010262489318848 + }, + { + "auxiliary_loss_clip": 0.01115534, + "auxiliary_loss_mlp": 0.00873093, + "balance_loss_clip": 1.02884507, + "balance_loss_mlp": 1.00019968, + "epoch": 0.48890759333854383, + "flos": 22784387224320.0, + "grad_norm": 13.198773021256075, + "language_loss": 0.72268331, + "learning_rate": 2.1688356707126286e-06, + "loss": 0.74256957, + "num_input_tokens_seen": 87563745, + "step": 4066, + "time_per_iteration": 2.8909084796905518 + }, + { + "auxiliary_loss_clip": 0.01113715, + "auxiliary_loss_mlp": 0.01087473, + "balance_loss_clip": 1.02746725, + "balance_loss_mlp": 1.00684023, + "epoch": 0.48902783622918294, + "flos": 17786088956160.0, + "grad_norm": 1.8010983251109254, + "language_loss": 0.69693857, + "learning_rate": 2.168059466862001e-06, + "loss": 0.71895045, + "num_input_tokens_seen": 87581895, + "step": 4067, + "time_per_iteration": 2.790165662765503 + }, + { + "auxiliary_loss_clip": 0.01121675, + "auxiliary_loss_mlp": 0.0108566, + "balance_loss_clip": 1.02673841, + "balance_loss_mlp": 1.00516987, + "epoch": 0.48914807911982205, + "flos": 22310294590080.0, + "grad_norm": 1.9452832322158935, + "language_loss": 0.81639773, + "learning_rate": 2.167283237516887e-06, + "loss": 0.83847111, + "num_input_tokens_seen": 87600170, + "step": 4068, + "time_per_iteration": 2.8134243488311768 + }, + { + "auxiliary_loss_clip": 0.01123381, + "auxiliary_loss_mlp": 0.01085154, + "balance_loss_clip": 1.02836514, + "balance_loss_mlp": 1.0046165, + "epoch": 0.4892683220104611, + "flos": 16363020954240.0, + "grad_norm": 1.735604183717936, + "language_loss": 0.7494747, + "learning_rate": 2.1665069827950383e-06, + "loss": 0.77156013, + "num_input_tokens_seen": 87617455, + "step": 4069, + "time_per_iteration": 2.7074623107910156 + }, + { + "auxiliary_loss_clip": 0.01124213, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_clip": 1.02898097, + "balance_loss_mlp": 1.00432479, + "epoch": 0.4893885649011002, + "flos": 15739144606080.0, + "grad_norm": 1.760671615778458, + "language_loss": 0.86794549, + "learning_rate": 2.1657307028142126e-06, + "loss": 0.89003479, + "num_input_tokens_seen": 87634995, + "step": 4070, + "time_per_iteration": 2.726252794265747 + }, + { + "auxiliary_loss_clip": 0.01122911, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_clip": 1.02924228, + "balance_loss_mlp": 1.0049262, + "epoch": 0.48950880779173933, + "flos": 28581984887040.0, + "grad_norm": 1.842381021872357, + "language_loss": 0.67378539, + "learning_rate": 2.164954397692171e-06, + "loss": 0.69587052, + "num_input_tokens_seen": 87654420, + "step": 4071, + "time_per_iteration": 2.765997886657715 + }, + { + "auxiliary_loss_clip": 0.01111574, + "auxiliary_loss_mlp": 0.01080508, + "balance_loss_clip": 1.03100419, + "balance_loss_mlp": 1.00116181, + "epoch": 0.4896290506823784, + "flos": 66186310746240.0, + "grad_norm": 1.074309236657183, + "language_loss": 0.77317512, + "learning_rate": 2.164178067546678e-06, + "loss": 0.79509592, + "num_input_tokens_seen": 87713585, + "step": 4072, + "time_per_iteration": 3.3036303520202637 + }, + { + "auxiliary_loss_clip": 0.01124061, + "auxiliary_loss_mlp": 0.01085037, + "balance_loss_clip": 1.02784836, + "balance_loss_mlp": 1.0045948, + "epoch": 0.4897492935730175, + "flos": 12531065207040.0, + "grad_norm": 1.70888900498111, + "language_loss": 0.91104555, + "learning_rate": 2.163401712495504e-06, + "loss": 0.93313658, + "num_input_tokens_seen": 87731280, + "step": 4073, + "time_per_iteration": 2.7193055152893066 + }, + { + "auxiliary_loss_clip": 0.01085427, + "auxiliary_loss_mlp": 0.01085405, + "balance_loss_clip": 1.02488708, + "balance_loss_mlp": 1.00486767, + "epoch": 0.4898695364636566, + "flos": 23476816679040.0, + "grad_norm": 1.610908593627063, + "language_loss": 0.79182321, + "learning_rate": 2.1626253326564194e-06, + "loss": 0.81353152, + "num_input_tokens_seen": 87750230, + "step": 4074, + "time_per_iteration": 2.7897887229919434 + }, + { + "auxiliary_loss_clip": 0.01122627, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_clip": 1.02769983, + "balance_loss_mlp": 1.00453544, + "epoch": 0.48998977935429566, + "flos": 27160209774720.0, + "grad_norm": 1.6339972326860233, + "language_loss": 0.76894855, + "learning_rate": 2.161848928147201e-06, + "loss": 0.79102647, + "num_input_tokens_seen": 87770500, + "step": 4075, + "time_per_iteration": 2.7760446071624756 + }, + { + "auxiliary_loss_clip": 0.01130696, + "auxiliary_loss_mlp": 0.01085416, + "balance_loss_clip": 1.02859116, + "balance_loss_mlp": 1.00487828, + "epoch": 0.4901100222449348, + "flos": 20339588856960.0, + "grad_norm": 1.9354708364150615, + "language_loss": 0.80977243, + "learning_rate": 2.161072499085629e-06, + "loss": 0.83193356, + "num_input_tokens_seen": 87789495, + "step": 4076, + "time_per_iteration": 2.6444547176361084 + }, + { + "auxiliary_loss_clip": 0.01116076, + "auxiliary_loss_mlp": 0.01085166, + "balance_loss_clip": 1.02884007, + "balance_loss_mlp": 1.00462782, + "epoch": 0.4902302651355739, + "flos": 30446359384320.0, + "grad_norm": 1.778847092475032, + "language_loss": 0.82843792, + "learning_rate": 2.160296045589487e-06, + "loss": 0.8504504, + "num_input_tokens_seen": 87812955, + "step": 4077, + "time_per_iteration": 2.918274164199829 + }, + { + "auxiliary_loss_clip": 0.01124354, + "auxiliary_loss_mlp": 0.01084864, + "balance_loss_clip": 1.02695501, + "balance_loss_mlp": 1.00427854, + "epoch": 0.49035050802621294, + "flos": 19174180089600.0, + "grad_norm": 1.8838380705770796, + "language_loss": 0.69731653, + "learning_rate": 2.159519567776562e-06, + "loss": 0.71940869, + "num_input_tokens_seen": 87832605, + "step": 4078, + "time_per_iteration": 2.6838183403015137 + }, + { + "auxiliary_loss_clip": 0.01092613, + "auxiliary_loss_mlp": 0.01085237, + "balance_loss_clip": 1.02862203, + "balance_loss_mlp": 1.00469923, + "epoch": 0.49047075091685205, + "flos": 22228489365120.0, + "grad_norm": 3.0627338087738987, + "language_loss": 0.7062214, + "learning_rate": 2.1587430657646463e-06, + "loss": 0.72799993, + "num_input_tokens_seen": 87846040, + "step": 4079, + "time_per_iteration": 2.770348072052002 + }, + { + "auxiliary_loss_clip": 0.01122603, + "auxiliary_loss_mlp": 0.01085935, + "balance_loss_clip": 1.02908754, + "balance_loss_mlp": 1.00544488, + "epoch": 0.4905909938074911, + "flos": 20156516213760.0, + "grad_norm": 1.7122298076248739, + "language_loss": 0.77984464, + "learning_rate": 2.157966539671533e-06, + "loss": 0.80193007, + "num_input_tokens_seen": 87865680, + "step": 4080, + "time_per_iteration": 2.7281556129455566 + }, + { + "auxiliary_loss_clip": 0.01114901, + "auxiliary_loss_mlp": 0.01085652, + "balance_loss_clip": 1.02840185, + "balance_loss_mlp": 1.00511456, + "epoch": 0.4907112366981302, + "flos": 17202217380480.0, + "grad_norm": 1.975473099521667, + "language_loss": 0.6729399, + "learning_rate": 2.157189989615021e-06, + "loss": 0.69494545, + "num_input_tokens_seen": 87884270, + "step": 4081, + "time_per_iteration": 3.713528871536255 + }, + { + "auxiliary_loss_clip": 0.01135623, + "auxiliary_loss_mlp": 0.00873184, + "balance_loss_clip": 1.03083456, + "balance_loss_mlp": 1.00013506, + "epoch": 0.4908314795887693, + "flos": 21688968107520.0, + "grad_norm": 1.625906226495822, + "language_loss": 0.74873072, + "learning_rate": 2.156413415712913e-06, + "loss": 0.7688188, + "num_input_tokens_seen": 87906320, + "step": 4082, + "time_per_iteration": 2.778400182723999 + }, + { + "auxiliary_loss_clip": 0.01123078, + "auxiliary_loss_mlp": 0.00873139, + "balance_loss_clip": 1.02849126, + "balance_loss_mlp": 1.00016928, + "epoch": 0.4909517224794084, + "flos": 26213676531840.0, + "grad_norm": 1.6969292037866148, + "language_loss": 0.78354049, + "learning_rate": 2.155636818083014e-06, + "loss": 0.80350268, + "num_input_tokens_seen": 87927690, + "step": 4083, + "time_per_iteration": 3.7436327934265137 + }, + { + "auxiliary_loss_clip": 0.01122425, + "auxiliary_loss_mlp": 0.01084441, + "balance_loss_clip": 1.02955937, + "balance_loss_mlp": 1.00414169, + "epoch": 0.4910719653700475, + "flos": 23148377694720.0, + "grad_norm": 1.8356088325555642, + "language_loss": 0.84260976, + "learning_rate": 2.154860196843134e-06, + "loss": 0.86467844, + "num_input_tokens_seen": 87946885, + "step": 4084, + "time_per_iteration": 2.80325984954834 + }, + { + "auxiliary_loss_clip": 0.01141794, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_clip": 1.02915955, + "balance_loss_mlp": 1.00387049, + "epoch": 0.4911922082606866, + "flos": 23331845387520.0, + "grad_norm": 1.6961207695264144, + "language_loss": 0.7686615, + "learning_rate": 2.154083552111085e-06, + "loss": 0.79092395, + "num_input_tokens_seen": 87966055, + "step": 4085, + "time_per_iteration": 2.6826417446136475 + }, + { + "auxiliary_loss_clip": 0.01141592, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_clip": 1.02910256, + "balance_loss_mlp": 1.00509858, + "epoch": 0.49131245115132566, + "flos": 29203239542400.0, + "grad_norm": 1.6603409590655354, + "language_loss": 0.81754124, + "learning_rate": 2.1533068840046834e-06, + "loss": 0.8398149, + "num_input_tokens_seen": 87986320, + "step": 4086, + "time_per_iteration": 3.712536096572876 + }, + { + "auxiliary_loss_clip": 0.01122796, + "auxiliary_loss_mlp": 0.00873119, + "balance_loss_clip": 1.02844119, + "balance_loss_mlp": 1.00014973, + "epoch": 0.49143269404196477, + "flos": 20147465986560.0, + "grad_norm": 2.511879892849001, + "language_loss": 0.61625379, + "learning_rate": 2.152530192641749e-06, + "loss": 0.63621294, + "num_input_tokens_seen": 88001230, + "step": 4087, + "time_per_iteration": 2.830521583557129 + }, + { + "auxiliary_loss_clip": 0.01118076, + "auxiliary_loss_mlp": 0.01085518, + "balance_loss_clip": 1.02987123, + "balance_loss_mlp": 1.00502777, + "epoch": 0.4915529369326039, + "flos": 24389809597440.0, + "grad_norm": 1.8107288397331684, + "language_loss": 0.72377777, + "learning_rate": 2.1517534781401068e-06, + "loss": 0.74581373, + "num_input_tokens_seen": 88019110, + "step": 4088, + "time_per_iteration": 3.656733989715576 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.0108657, + "balance_loss_clip": 1.02782571, + "balance_loss_mlp": 1.00607991, + "epoch": 0.49167317982324293, + "flos": 10524305197440.0, + "grad_norm": 2.0541463694381843, + "language_loss": 0.6910429, + "learning_rate": 2.150976740617581e-06, + "loss": 0.71321869, + "num_input_tokens_seen": 88035670, + "step": 4089, + "time_per_iteration": 2.646613836288452 + }, + { + "auxiliary_loss_clip": 0.0112485, + "auxiliary_loss_mlp": 0.01085, + "balance_loss_clip": 1.02965164, + "balance_loss_mlp": 1.00446212, + "epoch": 0.49179342271388204, + "flos": 25593427457280.0, + "grad_norm": 2.3499814073858944, + "language_loss": 0.71574157, + "learning_rate": 2.150199980192006e-06, + "loss": 0.73784006, + "num_input_tokens_seen": 88054790, + "step": 4090, + "time_per_iteration": 2.7854790687561035 + }, + { + "auxiliary_loss_clip": 0.01117234, + "auxiliary_loss_mlp": 0.01084729, + "balance_loss_clip": 1.02740145, + "balance_loss_mlp": 1.00433421, + "epoch": 0.49191366560452116, + "flos": 21102043875840.0, + "grad_norm": 1.9067195997298294, + "language_loss": 0.80766982, + "learning_rate": 2.1494231969812114e-06, + "loss": 0.8296895, + "num_input_tokens_seen": 88073780, + "step": 4091, + "time_per_iteration": 2.7232630252838135 + }, + { + "auxiliary_loss_clip": 0.01114151, + "auxiliary_loss_mlp": 0.01084776, + "balance_loss_clip": 1.02795887, + "balance_loss_mlp": 1.00433397, + "epoch": 0.4920339084951602, + "flos": 26067520091520.0, + "grad_norm": 1.993744977137475, + "language_loss": 0.81223071, + "learning_rate": 2.1486463911030372e-06, + "loss": 0.83421993, + "num_input_tokens_seen": 88094430, + "step": 4092, + "time_per_iteration": 2.8113512992858887 + }, + { + "auxiliary_loss_clip": 0.01121198, + "auxiliary_loss_mlp": 0.01084996, + "balance_loss_clip": 1.02697968, + "balance_loss_mlp": 1.00445867, + "epoch": 0.4921541513857993, + "flos": 25081269384960.0, + "grad_norm": 1.6775213365322552, + "language_loss": 0.74439037, + "learning_rate": 2.147869562675324e-06, + "loss": 0.76645231, + "num_input_tokens_seen": 88113400, + "step": 4093, + "time_per_iteration": 2.7461256980895996 + }, + { + "auxiliary_loss_clip": 0.01131075, + "auxiliary_loss_mlp": 0.01085338, + "balance_loss_clip": 1.02808666, + "balance_loss_mlp": 1.00465763, + "epoch": 0.49227439427643843, + "flos": 24389809597440.0, + "grad_norm": 1.573320892200575, + "language_loss": 0.72807807, + "learning_rate": 2.147092711815915e-06, + "loss": 0.75024217, + "num_input_tokens_seen": 88132750, + "step": 4094, + "time_per_iteration": 2.719287633895874 + }, + { + "auxiliary_loss_clip": 0.01111673, + "auxiliary_loss_mlp": 0.0108505, + "balance_loss_clip": 1.02677178, + "balance_loss_mlp": 1.00456023, + "epoch": 0.4923946371670775, + "flos": 11363753018880.0, + "grad_norm": 2.387666788025062, + "language_loss": 0.86931366, + "learning_rate": 2.1463158386426593e-06, + "loss": 0.89128083, + "num_input_tokens_seen": 88150560, + "step": 4095, + "time_per_iteration": 2.7290403842926025 + }, + { + "auxiliary_loss_clip": 0.01121909, + "auxiliary_loss_mlp": 0.01084188, + "balance_loss_clip": 1.02705395, + "balance_loss_mlp": 1.00355458, + "epoch": 0.4925148800577166, + "flos": 30445964334720.0, + "grad_norm": 2.1405101059282496, + "language_loss": 0.77710479, + "learning_rate": 2.145538943273407e-06, + "loss": 0.79916573, + "num_input_tokens_seen": 88170835, + "step": 4096, + "time_per_iteration": 2.7779574394226074 + }, + { + "auxiliary_loss_clip": 0.01143107, + "auxiliary_loss_mlp": 0.0108564, + "balance_loss_clip": 1.03087151, + "balance_loss_mlp": 1.00514984, + "epoch": 0.49263512294835565, + "flos": 20850454039680.0, + "grad_norm": 1.7364464402071718, + "language_loss": 0.71796179, + "learning_rate": 2.144762025826013e-06, + "loss": 0.74024922, + "num_input_tokens_seen": 88189925, + "step": 4097, + "time_per_iteration": 2.687973737716675 + }, + { + "auxiliary_loss_clip": 0.01133982, + "auxiliary_loss_mlp": 0.01084701, + "balance_loss_clip": 1.03014374, + "balance_loss_mlp": 1.00411534, + "epoch": 0.49275536583899476, + "flos": 23767477534080.0, + "grad_norm": 1.8513261477191747, + "language_loss": 0.8622793, + "learning_rate": 2.143985086418334e-06, + "loss": 0.88446617, + "num_input_tokens_seen": 88205105, + "step": 4098, + "time_per_iteration": 2.7228591442108154 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01084748, + "balance_loss_clip": 1.02810025, + "balance_loss_mlp": 1.00440097, + "epoch": 0.4928756087296339, + "flos": 22273522041600.0, + "grad_norm": 1.4604270534805046, + "language_loss": 0.76650298, + "learning_rate": 2.1432081251682324e-06, + "loss": 0.78858292, + "num_input_tokens_seen": 88225475, + "step": 4099, + "time_per_iteration": 2.8576090335845947 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01085257, + "balance_loss_clip": 1.02649808, + "balance_loss_mlp": 1.00481486, + "epoch": 0.49299585162027293, + "flos": 19645471463040.0, + "grad_norm": 1.7458476183666527, + "language_loss": 0.8742705, + "learning_rate": 2.142431142193572e-06, + "loss": 0.89639902, + "num_input_tokens_seen": 88243255, + "step": 4100, + "time_per_iteration": 2.7190184593200684 + }, + { + "auxiliary_loss_clip": 0.01143272, + "auxiliary_loss_mlp": 0.01085964, + "balance_loss_clip": 1.03116155, + "balance_loss_mlp": 1.00547361, + "epoch": 0.49311609451091204, + "flos": 38837138497920.0, + "grad_norm": 2.1231470822934835, + "language_loss": 0.71879733, + "learning_rate": 2.1416541376122207e-06, + "loss": 0.7410897, + "num_input_tokens_seen": 88263435, + "step": 4101, + "time_per_iteration": 2.802690029144287 + }, + { + "auxiliary_loss_clip": 0.0114127, + "auxiliary_loss_mlp": 0.01085908, + "balance_loss_clip": 1.02936292, + "balance_loss_mlp": 1.00537002, + "epoch": 0.49323633740155115, + "flos": 28329102161280.0, + "grad_norm": 1.6467149118830113, + "language_loss": 0.73273593, + "learning_rate": 2.1408771115420496e-06, + "loss": 0.75500774, + "num_input_tokens_seen": 88283295, + "step": 4102, + "time_per_iteration": 2.6692259311676025 + }, + { + "auxiliary_loss_clip": 0.01100919, + "auxiliary_loss_mlp": 0.01084637, + "balance_loss_clip": 1.02520275, + "balance_loss_mlp": 1.0042423, + "epoch": 0.4933565802921902, + "flos": 21135584200320.0, + "grad_norm": 1.6969127528804473, + "language_loss": 0.64817202, + "learning_rate": 2.140100064100932e-06, + "loss": 0.67002749, + "num_input_tokens_seen": 88299270, + "step": 4103, + "time_per_iteration": 2.775965452194214 + }, + { + "auxiliary_loss_clip": 0.01131506, + "auxiliary_loss_mlp": 0.01085463, + "balance_loss_clip": 1.0289408, + "balance_loss_mlp": 1.00511599, + "epoch": 0.4934768231828293, + "flos": 18039007595520.0, + "grad_norm": 1.8615572066327415, + "language_loss": 0.75578153, + "learning_rate": 2.139322995406746e-06, + "loss": 0.77795124, + "num_input_tokens_seen": 88316905, + "step": 4104, + "time_per_iteration": 2.622529983520508 + }, + { + "auxiliary_loss_clip": 0.01143357, + "auxiliary_loss_mlp": 0.01086488, + "balance_loss_clip": 1.03110182, + "balance_loss_mlp": 1.00595009, + "epoch": 0.4935970660734684, + "flos": 23469957181440.0, + "grad_norm": 1.8528638785911469, + "language_loss": 0.79637074, + "learning_rate": 2.1385459055773727e-06, + "loss": 0.8186692, + "num_input_tokens_seen": 88335095, + "step": 4105, + "time_per_iteration": 2.6309680938720703 + }, + { + "auxiliary_loss_clip": 0.01105926, + "auxiliary_loss_mlp": 0.00872802, + "balance_loss_clip": 1.02916634, + "balance_loss_mlp": 1.00025034, + "epoch": 0.4937173089641075, + "flos": 64479258840960.0, + "grad_norm": 1.9113070639166074, + "language_loss": 0.73860693, + "learning_rate": 2.137768794730696e-06, + "loss": 0.75839424, + "num_input_tokens_seen": 88358545, + "step": 4106, + "time_per_iteration": 3.230637311935425 + }, + { + "auxiliary_loss_clip": 0.01123218, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_clip": 1.02907443, + "balance_loss_mlp": 1.00538921, + "epoch": 0.4938375518547466, + "flos": 22346025644160.0, + "grad_norm": 1.7784309858455942, + "language_loss": 0.80613309, + "learning_rate": 2.1369916629846026e-06, + "loss": 0.82822502, + "num_input_tokens_seen": 88378295, + "step": 4107, + "time_per_iteration": 3.582160472869873 + }, + { + "auxiliary_loss_clip": 0.01121185, + "auxiliary_loss_mlp": 0.01085234, + "balance_loss_clip": 1.02690244, + "balance_loss_mlp": 1.00464797, + "epoch": 0.4939577947453857, + "flos": 17858700299520.0, + "grad_norm": 1.7577046452824379, + "language_loss": 0.74872196, + "learning_rate": 2.136214510456983e-06, + "loss": 0.77078617, + "num_input_tokens_seen": 88396750, + "step": 4108, + "time_per_iteration": 3.488668441772461 + }, + { + "auxiliary_loss_clip": 0.01094622, + "auxiliary_loss_mlp": 0.00873106, + "balance_loss_clip": 1.02983844, + "balance_loss_mlp": 1.00215769, + "epoch": 0.49407803763602476, + "flos": 70066746875520.0, + "grad_norm": 0.8829176937837143, + "language_loss": 0.63164067, + "learning_rate": 2.1354373372657296e-06, + "loss": 0.65131795, + "num_input_tokens_seen": 88455190, + "step": 4109, + "time_per_iteration": 3.372694969177246 + }, + { + "auxiliary_loss_clip": 0.01141734, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_clip": 1.02997911, + "balance_loss_mlp": 1.00499964, + "epoch": 0.49419828052666387, + "flos": 24317485562880.0, + "grad_norm": 1.5640817861004732, + "language_loss": 0.70876014, + "learning_rate": 2.1346601435287404e-06, + "loss": 0.73103189, + "num_input_tokens_seen": 88477460, + "step": 4110, + "time_per_iteration": 2.6552734375 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01085407, + "balance_loss_clip": 1.02819729, + "balance_loss_mlp": 1.00496423, + "epoch": 0.494318523417303, + "flos": 29386060790400.0, + "grad_norm": 1.6397198381621523, + "language_loss": 0.80259174, + "learning_rate": 2.1338829293639144e-06, + "loss": 0.82467496, + "num_input_tokens_seen": 88497820, + "step": 4111, + "time_per_iteration": 3.6404905319213867 + }, + { + "auxiliary_loss_clip": 0.01098089, + "auxiliary_loss_mlp": 0.01084528, + "balance_loss_clip": 1.02307904, + "balance_loss_mlp": 1.00403786, + "epoch": 0.49443876630794203, + "flos": 15268284195840.0, + "grad_norm": 1.7937498210262948, + "language_loss": 0.82683289, + "learning_rate": 2.1331056948891547e-06, + "loss": 0.84865904, + "num_input_tokens_seen": 88514920, + "step": 4112, + "time_per_iteration": 2.7962329387664795 + }, + { + "auxiliary_loss_clip": 0.01120211, + "auxiliary_loss_mlp": 0.01084969, + "balance_loss_clip": 1.02638674, + "balance_loss_mlp": 1.00447917, + "epoch": 0.49455900919858115, + "flos": 12347453859840.0, + "grad_norm": 2.4239023627278775, + "language_loss": 0.75843209, + "learning_rate": 2.1323284402223666e-06, + "loss": 0.78048384, + "num_input_tokens_seen": 88530910, + "step": 4113, + "time_per_iteration": 3.6430039405822754 + }, + { + "auxiliary_loss_clip": 0.01144368, + "auxiliary_loss_mlp": 0.00872834, + "balance_loss_clip": 1.03277016, + "balance_loss_mlp": 1.0002594, + "epoch": 0.4946792520892202, + "flos": 22779610715520.0, + "grad_norm": 1.7245547224501845, + "language_loss": 0.88164651, + "learning_rate": 2.1315511654814597e-06, + "loss": 0.90181851, + "num_input_tokens_seen": 88549320, + "step": 4114, + "time_per_iteration": 2.7050442695617676 + }, + { + "auxiliary_loss_clip": 0.01118665, + "auxiliary_loss_mlp": 0.01086708, + "balance_loss_clip": 1.0262053, + "balance_loss_mlp": 1.00626588, + "epoch": 0.4947994949798593, + "flos": 23148126299520.0, + "grad_norm": 1.7278827546913214, + "language_loss": 0.78018868, + "learning_rate": 2.1307738707843456e-06, + "loss": 0.80224246, + "num_input_tokens_seen": 88568985, + "step": 4115, + "time_per_iteration": 2.67964506149292 + }, + { + "auxiliary_loss_clip": 0.011345, + "auxiliary_loss_mlp": 0.01085075, + "balance_loss_clip": 1.03060174, + "balance_loss_mlp": 1.00453758, + "epoch": 0.4949197378704984, + "flos": 23659997063040.0, + "grad_norm": 1.8775525606227546, + "language_loss": 0.68885571, + "learning_rate": 2.1299965562489385e-06, + "loss": 0.71105146, + "num_input_tokens_seen": 88588790, + "step": 4116, + "time_per_iteration": 2.751904010772705 + }, + { + "auxiliary_loss_clip": 0.01133179, + "auxiliary_loss_mlp": 0.01085756, + "balance_loss_clip": 1.02842033, + "balance_loss_mlp": 1.00521839, + "epoch": 0.4950399807611375, + "flos": 26911493026560.0, + "grad_norm": 1.4260723526719445, + "language_loss": 0.78778601, + "learning_rate": 2.129219221993158e-06, + "loss": 0.80997539, + "num_input_tokens_seen": 88613575, + "step": 4117, + "time_per_iteration": 2.7479124069213867 + }, + { + "auxiliary_loss_clip": 0.01095475, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_clip": 1.02276504, + "balance_loss_mlp": 0.99996263, + "epoch": 0.4951602236517766, + "flos": 67315270187520.0, + "grad_norm": 0.8213850034985813, + "language_loss": 0.59951389, + "learning_rate": 2.128441868134924e-06, + "loss": 0.6212579, + "num_input_tokens_seen": 88675510, + "step": 4118, + "time_per_iteration": 3.3687703609466553 + }, + { + "auxiliary_loss_clip": 0.01114543, + "auxiliary_loss_mlp": 0.01084827, + "balance_loss_clip": 1.02800059, + "balance_loss_mlp": 1.00438428, + "epoch": 0.4952804665424157, + "flos": 19901442758400.0, + "grad_norm": 1.9895644202459584, + "language_loss": 0.83167404, + "learning_rate": 2.1276644947921606e-06, + "loss": 0.85366774, + "num_input_tokens_seen": 88694425, + "step": 4119, + "time_per_iteration": 2.791943311691284 + }, + { + "auxiliary_loss_clip": 0.01129028, + "auxiliary_loss_mlp": 0.01084651, + "balance_loss_clip": 1.02687836, + "balance_loss_mlp": 1.00401747, + "epoch": 0.49540070943305475, + "flos": 18806813740800.0, + "grad_norm": 2.0686590023178377, + "language_loss": 0.82375044, + "learning_rate": 2.126887102082795e-06, + "loss": 0.84588724, + "num_input_tokens_seen": 88714450, + "step": 4120, + "time_per_iteration": 2.6662447452545166 + }, + { + "auxiliary_loss_clip": 0.01113788, + "auxiliary_loss_mlp": 0.01086367, + "balance_loss_clip": 1.02756739, + "balance_loss_mlp": 1.00578213, + "epoch": 0.49552095232369386, + "flos": 24934179191040.0, + "grad_norm": 1.6086754108499568, + "language_loss": 0.7056731, + "learning_rate": 2.126109690124757e-06, + "loss": 0.7276746, + "num_input_tokens_seen": 88735265, + "step": 4121, + "time_per_iteration": 2.8281936645507812 + }, + { + "auxiliary_loss_clip": 0.01103573, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_clip": 1.02696157, + "balance_loss_mlp": 1.00571465, + "epoch": 0.495641195214333, + "flos": 22857249962880.0, + "grad_norm": 2.0586329532569994, + "language_loss": 0.71052492, + "learning_rate": 2.1253322590359786e-06, + "loss": 0.73242366, + "num_input_tokens_seen": 88754600, + "step": 4122, + "time_per_iteration": 2.8200747966766357 + }, + { + "auxiliary_loss_clip": 0.01133813, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_clip": 1.02980113, + "balance_loss_mlp": 1.00674009, + "epoch": 0.49576143810497203, + "flos": 25769748343680.0, + "grad_norm": 2.3312661177551597, + "language_loss": 0.7372539, + "learning_rate": 2.124554808934397e-06, + "loss": 0.75946522, + "num_input_tokens_seen": 88775180, + "step": 4123, + "time_per_iteration": 2.7817177772521973 + }, + { + "auxiliary_loss_clip": 0.01108094, + "auxiliary_loss_mlp": 0.01085492, + "balance_loss_clip": 1.02846515, + "balance_loss_mlp": 1.00495458, + "epoch": 0.49588168099561114, + "flos": 22128838058880.0, + "grad_norm": 1.797604405063701, + "language_loss": 0.72899079, + "learning_rate": 2.1237773399379496e-06, + "loss": 0.75092661, + "num_input_tokens_seen": 88796145, + "step": 4124, + "time_per_iteration": 2.8266313076019287 + }, + { + "auxiliary_loss_clip": 0.01109608, + "auxiliary_loss_mlp": 0.01084648, + "balance_loss_clip": 1.02929664, + "balance_loss_mlp": 1.00406265, + "epoch": 0.49600192388625025, + "flos": 24387331559040.0, + "grad_norm": 1.626619710784234, + "language_loss": 0.86910939, + "learning_rate": 2.122999852164578e-06, + "loss": 0.89105195, + "num_input_tokens_seen": 88816765, + "step": 4125, + "time_per_iteration": 2.7931466102600098 + }, + { + "auxiliary_loss_clip": 0.01090234, + "auxiliary_loss_mlp": 0.01085896, + "balance_loss_clip": 1.02928865, + "balance_loss_mlp": 1.00535858, + "epoch": 0.4961221667768893, + "flos": 22857429530880.0, + "grad_norm": 2.1323200962897952, + "language_loss": 0.58991486, + "learning_rate": 2.122222345732227e-06, + "loss": 0.61167622, + "num_input_tokens_seen": 88836680, + "step": 4126, + "time_per_iteration": 2.884068727493286 + }, + { + "auxiliary_loss_clip": 0.01114453, + "auxiliary_loss_mlp": 0.01086991, + "balance_loss_clip": 1.02782559, + "balance_loss_mlp": 1.00635791, + "epoch": 0.4962424096675284, + "flos": 17858089768320.0, + "grad_norm": 1.636039819194729, + "language_loss": 0.82821381, + "learning_rate": 2.121444820758843e-06, + "loss": 0.85022819, + "num_input_tokens_seen": 88855320, + "step": 4127, + "time_per_iteration": 2.7673041820526123 + }, + { + "auxiliary_loss_clip": 0.01095871, + "auxiliary_loss_mlp": 0.01085391, + "balance_loss_clip": 1.02491689, + "balance_loss_mlp": 1.00480533, + "epoch": 0.49636265255816747, + "flos": 21793611404160.0, + "grad_norm": 1.9333479717040232, + "language_loss": 0.78864086, + "learning_rate": 2.120667277362376e-06, + "loss": 0.81045347, + "num_input_tokens_seen": 88874035, + "step": 4128, + "time_per_iteration": 2.8272485733032227 + }, + { + "auxiliary_loss_clip": 0.01143555, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_clip": 1.03105593, + "balance_loss_mlp": 1.00545669, + "epoch": 0.4964828954488066, + "flos": 16358603581440.0, + "grad_norm": 1.8742946926791018, + "language_loss": 0.84945703, + "learning_rate": 2.1198897156607796e-06, + "loss": 0.8717525, + "num_input_tokens_seen": 88891390, + "step": 4129, + "time_per_iteration": 2.6160881519317627 + }, + { + "auxiliary_loss_clip": 0.01119864, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_clip": 1.03118122, + "balance_loss_mlp": 1.00478351, + "epoch": 0.4966031383394457, + "flos": 24711101775360.0, + "grad_norm": 1.7850091811855624, + "language_loss": 0.74173278, + "learning_rate": 2.1191121357720085e-06, + "loss": 0.76378417, + "num_input_tokens_seen": 88909450, + "step": 4130, + "time_per_iteration": 2.6724324226379395 + }, + { + "auxiliary_loss_clip": 0.01101157, + "auxiliary_loss_mlp": 0.01085318, + "balance_loss_clip": 1.02502692, + "balance_loss_mlp": 1.00473285, + "epoch": 0.49672338123008475, + "flos": 22930615491840.0, + "grad_norm": 2.1168305544862935, + "language_loss": 0.7463553, + "learning_rate": 2.1183345378140206e-06, + "loss": 0.76822007, + "num_input_tokens_seen": 88929195, + "step": 4131, + "time_per_iteration": 2.844531297683716 + }, + { + "auxiliary_loss_clip": 0.01118349, + "auxiliary_loss_mlp": 0.01080583, + "balance_loss_clip": 1.02926636, + "balance_loss_mlp": 1.00123775, + "epoch": 0.49684362412072386, + "flos": 65976736844160.0, + "grad_norm": 0.871955391448119, + "language_loss": 0.61985141, + "learning_rate": 2.1175569219047783e-06, + "loss": 0.64184076, + "num_input_tokens_seen": 88990635, + "step": 4132, + "time_per_iteration": 4.24664044380188 + }, + { + "auxiliary_loss_clip": 0.0114239, + "auxiliary_loss_mlp": 0.0108473, + "balance_loss_clip": 1.03003681, + "balance_loss_mlp": 1.00423956, + "epoch": 0.49696386701136297, + "flos": 19971288754560.0, + "grad_norm": 1.4971983583340047, + "language_loss": 0.73465228, + "learning_rate": 2.1167792881622437e-06, + "loss": 0.7569235, + "num_input_tokens_seen": 89009655, + "step": 4133, + "time_per_iteration": 3.5991084575653076 + }, + { + "auxiliary_loss_clip": 0.01120042, + "auxiliary_loss_mlp": 0.01085819, + "balance_loss_clip": 1.02686596, + "balance_loss_mlp": 1.00537682, + "epoch": 0.497084109902002, + "flos": 24750819239040.0, + "grad_norm": 1.4996433131006877, + "language_loss": 0.81155771, + "learning_rate": 2.116001636704384e-06, + "loss": 0.83361626, + "num_input_tokens_seen": 89030040, + "step": 4134, + "time_per_iteration": 2.7298777103424072 + }, + { + "auxiliary_loss_clip": 0.01093574, + "auxiliary_loss_mlp": 0.0108669, + "balance_loss_clip": 1.03027833, + "balance_loss_mlp": 1.00615168, + "epoch": 0.49720435279264114, + "flos": 21871825269120.0, + "grad_norm": 3.6471160079276803, + "language_loss": 0.80316651, + "learning_rate": 2.1152239676491685e-06, + "loss": 0.82496905, + "num_input_tokens_seen": 89048145, + "step": 4135, + "time_per_iteration": 2.7921621799468994 + }, + { + "auxiliary_loss_clip": 0.01124615, + "auxiliary_loss_mlp": 0.01085686, + "balance_loss_clip": 1.02837086, + "balance_loss_mlp": 1.00529158, + "epoch": 0.49732459568328025, + "flos": 23805794367360.0, + "grad_norm": 1.9593005293784764, + "language_loss": 0.73145771, + "learning_rate": 2.114446281114569e-06, + "loss": 0.75356066, + "num_input_tokens_seen": 89067165, + "step": 4136, + "time_per_iteration": 3.722289800643921 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.0108524, + "balance_loss_clip": 1.02888894, + "balance_loss_mlp": 1.00479722, + "epoch": 0.4974448385739193, + "flos": 20047742853120.0, + "grad_norm": 2.3538157231601686, + "language_loss": 0.75913829, + "learning_rate": 2.1136685772185587e-06, + "loss": 0.78122342, + "num_input_tokens_seen": 89086190, + "step": 4137, + "time_per_iteration": 2.7427079677581787 + }, + { + "auxiliary_loss_clip": 0.01124535, + "auxiliary_loss_mlp": 0.00872981, + "balance_loss_clip": 1.02797842, + "balance_loss_mlp": 1.00008345, + "epoch": 0.4975650814645584, + "flos": 24821347593600.0, + "grad_norm": 1.6803438426754875, + "language_loss": 0.77956355, + "learning_rate": 2.1128908560791163e-06, + "loss": 0.79953867, + "num_input_tokens_seen": 89106020, + "step": 4138, + "time_per_iteration": 2.83609676361084 + }, + { + "auxiliary_loss_clip": 0.01143857, + "auxiliary_loss_mlp": 0.0108509, + "balance_loss_clip": 1.03146863, + "balance_loss_mlp": 1.00459933, + "epoch": 0.4976853243551975, + "flos": 19829477859840.0, + "grad_norm": 2.0920106036559742, + "language_loss": 0.78236163, + "learning_rate": 2.1121131178142203e-06, + "loss": 0.80465114, + "num_input_tokens_seen": 89125385, + "step": 4139, + "time_per_iteration": 3.6076455116271973 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01084741, + "balance_loss_clip": 1.0260253, + "balance_loss_mlp": 1.00434661, + "epoch": 0.4978055672458366, + "flos": 23142990654720.0, + "grad_norm": 1.4449897673656418, + "language_loss": 0.82431793, + "learning_rate": 2.1113353625418544e-06, + "loss": 0.8463726, + "num_input_tokens_seen": 89143935, + "step": 4140, + "time_per_iteration": 2.742129325866699 + }, + { + "auxiliary_loss_clip": 0.01132667, + "auxiliary_loss_mlp": 0.01085838, + "balance_loss_clip": 1.03078747, + "balance_loss_mlp": 1.00553906, + "epoch": 0.4979258101364757, + "flos": 15559914718080.0, + "grad_norm": 1.6225615276952894, + "language_loss": 0.79104149, + "learning_rate": 2.1105575903800017e-06, + "loss": 0.81322658, + "num_input_tokens_seen": 89162655, + "step": 4141, + "time_per_iteration": 2.8002281188964844 + }, + { + "auxiliary_loss_clip": 0.01116945, + "auxiliary_loss_mlp": 0.01086277, + "balance_loss_clip": 1.02866983, + "balance_loss_mlp": 1.00573909, + "epoch": 0.4980460530271148, + "flos": 26356169784960.0, + "grad_norm": 1.9191967312315685, + "language_loss": 0.85222316, + "learning_rate": 2.1097798014466502e-06, + "loss": 0.87425536, + "num_input_tokens_seen": 89182255, + "step": 4142, + "time_per_iteration": 2.7001054286956787 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01085844, + "balance_loss_clip": 1.02818191, + "balance_loss_mlp": 1.00516284, + "epoch": 0.49816629591775385, + "flos": 17274541415040.0, + "grad_norm": 2.64174688250784, + "language_loss": 0.59168935, + "learning_rate": 2.109001995859791e-06, + "loss": 0.61387348, + "num_input_tokens_seen": 89201155, + "step": 4143, + "time_per_iteration": 2.675872325897217 + }, + { + "auxiliary_loss_clip": 0.01107682, + "auxiliary_loss_mlp": 0.01080382, + "balance_loss_clip": 1.02761388, + "balance_loss_mlp": 1.00103593, + "epoch": 0.49828653880839296, + "flos": 64930947344640.0, + "grad_norm": 0.797813441267085, + "language_loss": 0.60090613, + "learning_rate": 2.108224173737415e-06, + "loss": 0.62278676, + "num_input_tokens_seen": 89264455, + "step": 4144, + "time_per_iteration": 3.2750117778778076 + }, + { + "auxiliary_loss_clip": 0.01115221, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_clip": 1.02607334, + "balance_loss_mlp": 1.0045948, + "epoch": 0.498406781699032, + "flos": 27484806003840.0, + "grad_norm": 2.714847831327281, + "language_loss": 0.76664245, + "learning_rate": 2.1074463351975183e-06, + "loss": 0.78864694, + "num_input_tokens_seen": 89283340, + "step": 4145, + "time_per_iteration": 2.7712552547454834 + }, + { + "auxiliary_loss_clip": 0.01097961, + "auxiliary_loss_mlp": 0.01085291, + "balance_loss_clip": 1.02741647, + "balance_loss_mlp": 1.00484896, + "epoch": 0.49852702458967113, + "flos": 31499870307840.0, + "grad_norm": 1.740865495360276, + "language_loss": 0.71862632, + "learning_rate": 2.106668480358098e-06, + "loss": 0.74045885, + "num_input_tokens_seen": 89303565, + "step": 4146, + "time_per_iteration": 2.8082737922668457 + }, + { + "auxiliary_loss_clip": 0.01101586, + "auxiliary_loss_mlp": 0.01085467, + "balance_loss_clip": 1.02913916, + "balance_loss_mlp": 1.00473869, + "epoch": 0.49864726748031024, + "flos": 22852868503680.0, + "grad_norm": 3.582131963578028, + "language_loss": 0.70994282, + "learning_rate": 2.105890609337154e-06, + "loss": 0.73181337, + "num_input_tokens_seen": 89322080, + "step": 4147, + "time_per_iteration": 2.817328691482544 + }, + { + "auxiliary_loss_clip": 0.01124456, + "auxiliary_loss_mlp": 0.01079809, + "balance_loss_clip": 1.02756262, + "balance_loss_mlp": 1.00046337, + "epoch": 0.4987675103709493, + "flos": 70405708544640.0, + "grad_norm": 0.6907429637822271, + "language_loss": 0.63797313, + "learning_rate": 2.1051127222526883e-06, + "loss": 0.66001576, + "num_input_tokens_seen": 89394195, + "step": 4148, + "time_per_iteration": 3.2643556594848633 + }, + { + "auxiliary_loss_clip": 0.01131402, + "auxiliary_loss_mlp": 0.01084213, + "balance_loss_clip": 1.02880824, + "balance_loss_mlp": 1.00386548, + "epoch": 0.4988877532615884, + "flos": 28767571482240.0, + "grad_norm": 1.7317013904259837, + "language_loss": 0.8099581, + "learning_rate": 2.1043348192227067e-06, + "loss": 0.83211422, + "num_input_tokens_seen": 89414565, + "step": 4149, + "time_per_iteration": 2.744800090789795 + }, + { + "auxiliary_loss_clip": 0.01112537, + "auxiliary_loss_mlp": 0.01085976, + "balance_loss_clip": 1.02680683, + "balance_loss_mlp": 1.00558078, + "epoch": 0.4990079961522275, + "flos": 16872700988160.0, + "grad_norm": 1.758183522576919, + "language_loss": 0.61491489, + "learning_rate": 2.1035569003652156e-06, + "loss": 0.63690007, + "num_input_tokens_seen": 89433195, + "step": 4150, + "time_per_iteration": 2.749551296234131 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01086538, + "balance_loss_clip": 1.02737832, + "balance_loss_mlp": 1.00580966, + "epoch": 0.4991282390428666, + "flos": 13291042187520.0, + "grad_norm": 1.9385020049839656, + "language_loss": 0.81592357, + "learning_rate": 2.1027789657982255e-06, + "loss": 0.83783954, + "num_input_tokens_seen": 89447410, + "step": 4151, + "time_per_iteration": 2.7909786701202393 + }, + { + "auxiliary_loss_clip": 0.01085036, + "auxiliary_loss_mlp": 0.01085323, + "balance_loss_clip": 1.02528143, + "balance_loss_mlp": 1.00478482, + "epoch": 0.4992484819335057, + "flos": 21537496454400.0, + "grad_norm": 3.050514670000472, + "language_loss": 0.77257431, + "learning_rate": 2.1020010156397482e-06, + "loss": 0.79427791, + "num_input_tokens_seen": 89464630, + "step": 4152, + "time_per_iteration": 2.888343095779419 + }, + { + "auxiliary_loss_clip": 0.01133819, + "auxiliary_loss_mlp": 0.01083917, + "balance_loss_clip": 1.030092, + "balance_loss_mlp": 1.00342727, + "epoch": 0.4993687248241448, + "flos": 24860095390080.0, + "grad_norm": 1.9026850528550563, + "language_loss": 0.77322572, + "learning_rate": 2.101223050007797e-06, + "loss": 0.79540312, + "num_input_tokens_seen": 89483180, + "step": 4153, + "time_per_iteration": 2.755502700805664 + }, + { + "auxiliary_loss_clip": 0.01123833, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_clip": 1.02687597, + "balance_loss_mlp": 1.00002789, + "epoch": 0.49948896771478385, + "flos": 62941602453120.0, + "grad_norm": 0.8200353783074423, + "language_loss": 0.53802729, + "learning_rate": 2.1004450690203904e-06, + "loss": 0.56005943, + "num_input_tokens_seen": 89539260, + "step": 4154, + "time_per_iteration": 3.2636566162109375 + }, + { + "auxiliary_loss_clip": 0.01123865, + "auxiliary_loss_mlp": 0.01079575, + "balance_loss_clip": 1.02695024, + "balance_loss_mlp": 1.00022972, + "epoch": 0.49960921060542296, + "flos": 68284213516800.0, + "grad_norm": 0.8532908360000612, + "language_loss": 0.63337076, + "learning_rate": 2.099667072795546e-06, + "loss": 0.65540516, + "num_input_tokens_seen": 89601380, + "step": 4155, + "time_per_iteration": 3.2820117473602295 + }, + { + "auxiliary_loss_clip": 0.01130053, + "auxiliary_loss_mlp": 0.01085092, + "balance_loss_clip": 1.02728629, + "balance_loss_mlp": 1.00460172, + "epoch": 0.49972945349606207, + "flos": 23659350618240.0, + "grad_norm": 1.7956074444740984, + "language_loss": 0.79762828, + "learning_rate": 2.0988890614512864e-06, + "loss": 0.81977975, + "num_input_tokens_seen": 89621270, + "step": 4156, + "time_per_iteration": 2.6839759349823 + }, + { + "auxiliary_loss_clip": 0.01123478, + "auxiliary_loss_mlp": 0.01084963, + "balance_loss_clip": 1.02925038, + "balance_loss_mlp": 1.00447321, + "epoch": 0.4998496963867011, + "flos": 19755825022080.0, + "grad_norm": 1.7976736155368669, + "language_loss": 0.8405, + "learning_rate": 2.098111035105635e-06, + "loss": 0.86258447, + "num_input_tokens_seen": 89639695, + "step": 4157, + "time_per_iteration": 2.780592679977417 + }, + { + "auxiliary_loss_clip": 0.0107884, + "auxiliary_loss_mlp": 0.01085608, + "balance_loss_clip": 1.02578509, + "balance_loss_mlp": 1.00502276, + "epoch": 0.49996993927734024, + "flos": 22265728790400.0, + "grad_norm": 1.5723242611258033, + "language_loss": 0.73160338, + "learning_rate": 2.0973329938766176e-06, + "loss": 0.75324792, + "num_input_tokens_seen": 89657125, + "step": 4158, + "time_per_iteration": 3.6416256427764893 + }, + { + "auxiliary_loss_clip": 0.01135021, + "auxiliary_loss_mlp": 0.01085357, + "balance_loss_clip": 1.03053188, + "balance_loss_mlp": 1.00467658, + "epoch": 0.5000901821679793, + "flos": 23327212533120.0, + "grad_norm": 2.0758392629505447, + "language_loss": 0.78278947, + "learning_rate": 2.0965549378822618e-06, + "loss": 0.80499327, + "num_input_tokens_seen": 89678415, + "step": 4159, + "time_per_iteration": 3.5508792400360107 + }, + { + "auxiliary_loss_clip": 0.01060411, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_clip": 1.02525532, + "balance_loss_mlp": 1.00427461, + "epoch": 0.5002104250586185, + "flos": 20339014239360.0, + "grad_norm": 1.865082579784144, + "language_loss": 0.8387686, + "learning_rate": 2.095776867240599e-06, + "loss": 0.86022133, + "num_input_tokens_seen": 89695405, + "step": 4160, + "time_per_iteration": 3.1058895587921143 + }, + { + "auxiliary_loss_clip": 0.01113985, + "auxiliary_loss_mlp": 0.01085823, + "balance_loss_clip": 1.02780652, + "balance_loss_mlp": 1.00528502, + "epoch": 0.5003306679492575, + "flos": 13991372634240.0, + "grad_norm": 1.8630649144183493, + "language_loss": 0.82402945, + "learning_rate": 2.094998782069661e-06, + "loss": 0.84602749, + "num_input_tokens_seen": 89713110, + "step": 4161, + "time_per_iteration": 3.931166648864746 + }, + { + "auxiliary_loss_clip": 0.01141983, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_clip": 1.02991736, + "balance_loss_mlp": 1.005633, + "epoch": 0.5004509108398966, + "flos": 27672762896640.0, + "grad_norm": 1.7939551257496618, + "language_loss": 0.75901902, + "learning_rate": 2.0942206824874845e-06, + "loss": 0.78130054, + "num_input_tokens_seen": 89735885, + "step": 4162, + "time_per_iteration": 2.702282428741455 + }, + { + "auxiliary_loss_clip": 0.01126469, + "auxiliary_loss_mlp": 0.01084578, + "balance_loss_clip": 1.02498686, + "balance_loss_mlp": 1.00394535, + "epoch": 0.5005711537305357, + "flos": 14976186796800.0, + "grad_norm": 2.0097088082573533, + "language_loss": 0.79110342, + "learning_rate": 2.093442568612105e-06, + "loss": 0.81321388, + "num_input_tokens_seen": 89753690, + "step": 4163, + "time_per_iteration": 2.669832229614258 + }, + { + "auxiliary_loss_clip": 0.01141137, + "auxiliary_loss_mlp": 0.01084748, + "balance_loss_clip": 1.02862763, + "balance_loss_mlp": 1.00421071, + "epoch": 0.5006913966211748, + "flos": 26503259978880.0, + "grad_norm": 1.465093319670738, + "language_loss": 0.8524431, + "learning_rate": 2.0926644405615613e-06, + "loss": 0.87470198, + "num_input_tokens_seen": 89774590, + "step": 4164, + "time_per_iteration": 3.7590627670288086 + }, + { + "auxiliary_loss_clip": 0.01104345, + "auxiliary_loss_mlp": 0.01084548, + "balance_loss_clip": 1.0251615, + "balance_loss_mlp": 1.00405765, + "epoch": 0.5008116395118138, + "flos": 20449295971200.0, + "grad_norm": 1.7380442250583494, + "language_loss": 0.81355608, + "learning_rate": 2.091886298453897e-06, + "loss": 0.83544499, + "num_input_tokens_seen": 89792775, + "step": 4165, + "time_per_iteration": 2.710031032562256 + }, + { + "auxiliary_loss_clip": 0.01133461, + "auxiliary_loss_mlp": 0.01083841, + "balance_loss_clip": 1.02985144, + "balance_loss_mlp": 1.0033505, + "epoch": 0.500931882402453, + "flos": 21579871524480.0, + "grad_norm": 1.8536499832019553, + "language_loss": 0.72628641, + "learning_rate": 2.091108142407153e-06, + "loss": 0.7484594, + "num_input_tokens_seen": 89811515, + "step": 4166, + "time_per_iteration": 2.7477779388427734 + }, + { + "auxiliary_loss_clip": 0.01097803, + "auxiliary_loss_mlp": 0.0108099, + "balance_loss_clip": 1.01768768, + "balance_loss_mlp": 1.00164402, + "epoch": 0.5010521252930921, + "flos": 57785011925760.0, + "grad_norm": 0.8416162456780678, + "language_loss": 0.62468398, + "learning_rate": 2.090329972539377e-06, + "loss": 0.64647186, + "num_input_tokens_seen": 89870080, + "step": 4167, + "time_per_iteration": 3.2706055641174316 + }, + { + "auxiliary_loss_clip": 0.01071077, + "auxiliary_loss_mlp": 0.01084507, + "balance_loss_clip": 1.02231932, + "balance_loss_mlp": 1.00411177, + "epoch": 0.5011723681837311, + "flos": 18625500864000.0, + "grad_norm": 2.358461793602839, + "language_loss": 0.68518364, + "learning_rate": 2.089551788968616e-06, + "loss": 0.70673949, + "num_input_tokens_seen": 89888045, + "step": 4168, + "time_per_iteration": 2.9496490955352783 + }, + { + "auxiliary_loss_clip": 0.01123565, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_clip": 1.02667129, + "balance_loss_mlp": 1.00009298, + "epoch": 0.5012926110743702, + "flos": 55883146608000.0, + "grad_norm": 0.8418078261347427, + "language_loss": 0.61139774, + "learning_rate": 2.08877359181292e-06, + "loss": 0.63342786, + "num_input_tokens_seen": 89944610, + "step": 4169, + "time_per_iteration": 3.177284002304077 + }, + { + "auxiliary_loss_clip": 0.01115813, + "auxiliary_loss_mlp": 0.01084354, + "balance_loss_clip": 1.02813983, + "balance_loss_mlp": 1.00381649, + "epoch": 0.5014128539650093, + "flos": 24238266117120.0, + "grad_norm": 7.630605556379206, + "language_loss": 0.85436511, + "learning_rate": 2.0879953811903396e-06, + "loss": 0.87636685, + "num_input_tokens_seen": 89959495, + "step": 4170, + "time_per_iteration": 2.763780117034912 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01086009, + "balance_loss_clip": 1.02777708, + "balance_loss_mlp": 1.00532806, + "epoch": 0.5015330968556484, + "flos": 27527468382720.0, + "grad_norm": 1.7229601723116303, + "language_loss": 0.78778183, + "learning_rate": 2.08721715721893e-06, + "loss": 0.80994356, + "num_input_tokens_seen": 89978820, + "step": 4171, + "time_per_iteration": 2.7633087635040283 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01086808, + "balance_loss_clip": 1.0289911, + "balance_loss_mlp": 1.00617445, + "epoch": 0.5016533397462875, + "flos": 23800802376960.0, + "grad_norm": 1.8664364971680756, + "language_loss": 0.7729165, + "learning_rate": 2.0864389200167477e-06, + "loss": 0.79510593, + "num_input_tokens_seen": 89997075, + "step": 4172, + "time_per_iteration": 2.753380060195923 + }, + { + "auxiliary_loss_clip": 0.01135652, + "auxiliary_loss_mlp": 0.00873065, + "balance_loss_clip": 1.03147089, + "balance_loss_mlp": 1.00015783, + "epoch": 0.5017735826369266, + "flos": 25295009264640.0, + "grad_norm": 1.7396416443385305, + "language_loss": 0.78686613, + "learning_rate": 2.0856606697018504e-06, + "loss": 0.80695331, + "num_input_tokens_seen": 90015085, + "step": 4173, + "time_per_iteration": 2.7728993892669678 + }, + { + "auxiliary_loss_clip": 0.01122793, + "auxiliary_loss_mlp": 0.01085174, + "balance_loss_clip": 1.02751827, + "balance_loss_mlp": 1.00458848, + "epoch": 0.5018938255275657, + "flos": 16873203778560.0, + "grad_norm": 2.7699676760254137, + "language_loss": 0.73536164, + "learning_rate": 2.084882406392297e-06, + "loss": 0.75744134, + "num_input_tokens_seen": 90033045, + "step": 4174, + "time_per_iteration": 2.6655404567718506 + }, + { + "auxiliary_loss_clip": 0.01132552, + "auxiliary_loss_mlp": 0.01086211, + "balance_loss_clip": 1.02910805, + "balance_loss_mlp": 1.00567365, + "epoch": 0.5020140684182047, + "flos": 25515429073920.0, + "grad_norm": 3.0345197174927936, + "language_loss": 0.70744944, + "learning_rate": 2.0841041302061496e-06, + "loss": 0.72963709, + "num_input_tokens_seen": 90052505, + "step": 4175, + "time_per_iteration": 2.7604029178619385 + }, + { + "auxiliary_loss_clip": 0.01126607, + "auxiliary_loss_mlp": 0.01084751, + "balance_loss_clip": 1.03070354, + "balance_loss_mlp": 1.00421357, + "epoch": 0.5021343113088439, + "flos": 23659278791040.0, + "grad_norm": 1.761758215138054, + "language_loss": 0.75528884, + "learning_rate": 2.083325841261473e-06, + "loss": 0.77740246, + "num_input_tokens_seen": 90071565, + "step": 4176, + "time_per_iteration": 2.68979811668396 + }, + { + "auxiliary_loss_clip": 0.01124364, + "auxiliary_loss_mlp": 0.01084067, + "balance_loss_clip": 1.02841282, + "balance_loss_mlp": 1.00357711, + "epoch": 0.502254554199483, + "flos": 24534673148160.0, + "grad_norm": 2.2801742839652253, + "language_loss": 0.66231817, + "learning_rate": 2.0825475396763322e-06, + "loss": 0.68440247, + "num_input_tokens_seen": 90092215, + "step": 4177, + "time_per_iteration": 2.7243006229400635 + }, + { + "auxiliary_loss_clip": 0.01072308, + "auxiliary_loss_mlp": 0.01085125, + "balance_loss_clip": 1.0224272, + "balance_loss_mlp": 1.00463533, + "epoch": 0.502374797090122, + "flos": 34240285607040.0, + "grad_norm": 1.4100567447458605, + "language_loss": 0.65718573, + "learning_rate": 2.081769225568796e-06, + "loss": 0.67876005, + "num_input_tokens_seen": 90114665, + "step": 4178, + "time_per_iteration": 3.2414004802703857 + }, + { + "auxiliary_loss_clip": 0.01134182, + "auxiliary_loss_mlp": 0.01085788, + "balance_loss_clip": 1.02962732, + "balance_loss_mlp": 1.00515473, + "epoch": 0.5024950399807612, + "flos": 26031106679040.0, + "grad_norm": 3.831720238543852, + "language_loss": 0.75825059, + "learning_rate": 2.0809908990569327e-06, + "loss": 0.78045028, + "num_input_tokens_seen": 90136445, + "step": 4179, + "time_per_iteration": 3.249969244003296 + }, + { + "auxiliary_loss_clip": 0.01124057, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_clip": 1.02872634, + "balance_loss_mlp": 1.00552201, + "epoch": 0.5026152828714002, + "flos": 21252438120960.0, + "grad_norm": 2.450950082083911, + "language_loss": 0.78838634, + "learning_rate": 2.0802125602588146e-06, + "loss": 0.81048763, + "num_input_tokens_seen": 90155710, + "step": 4180, + "time_per_iteration": 2.7779622077941895 + }, + { + "auxiliary_loss_clip": 0.01142372, + "auxiliary_loss_mlp": 0.01085536, + "balance_loss_clip": 1.03027296, + "balance_loss_mlp": 1.0049026, + "epoch": 0.5027355257620393, + "flos": 30956111245440.0, + "grad_norm": 1.7480117701341267, + "language_loss": 0.66178125, + "learning_rate": 2.0794342092925146e-06, + "loss": 0.68406034, + "num_input_tokens_seen": 90176845, + "step": 4181, + "time_per_iteration": 2.726564645767212 + }, + { + "auxiliary_loss_clip": 0.0113435, + "auxiliary_loss_mlp": 0.01084282, + "balance_loss_clip": 1.03056908, + "balance_loss_mlp": 1.00374436, + "epoch": 0.5028557686526784, + "flos": 24791147233920.0, + "grad_norm": 5.036711736834002, + "language_loss": 0.6801188, + "learning_rate": 2.078655846276108e-06, + "loss": 0.70230514, + "num_input_tokens_seen": 90197175, + "step": 4182, + "time_per_iteration": 2.7698051929473877 + }, + { + "auxiliary_loss_clip": 0.01124913, + "auxiliary_loss_mlp": 0.01085281, + "balance_loss_clip": 1.02960873, + "balance_loss_mlp": 1.0046953, + "epoch": 0.5029760115433175, + "flos": 22966992990720.0, + "grad_norm": 2.0972280830256076, + "language_loss": 0.68644857, + "learning_rate": 2.0778774713276727e-06, + "loss": 0.70855057, + "num_input_tokens_seen": 90216650, + "step": 4183, + "time_per_iteration": 3.9072983264923096 + }, + { + "auxiliary_loss_clip": 0.01133114, + "auxiliary_loss_mlp": 0.01084786, + "balance_loss_clip": 1.0285604, + "balance_loss_mlp": 1.00415254, + "epoch": 0.5030962544339566, + "flos": 15305164485120.0, + "grad_norm": 2.3237294173371397, + "language_loss": 0.67524886, + "learning_rate": 2.077099084565287e-06, + "loss": 0.69742775, + "num_input_tokens_seen": 90234055, + "step": 4184, + "time_per_iteration": 3.506720781326294 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01083941, + "balance_loss_clip": 1.02819228, + "balance_loss_mlp": 1.00335515, + "epoch": 0.5032164973245957, + "flos": 24494847943680.0, + "grad_norm": 1.9327160881145502, + "language_loss": 0.64918423, + "learning_rate": 2.0763206861070313e-06, + "loss": 0.67126524, + "num_input_tokens_seen": 90253115, + "step": 4185, + "time_per_iteration": 2.75134015083313 + }, + { + "auxiliary_loss_clip": 0.01141605, + "auxiliary_loss_mlp": 0.01086415, + "balance_loss_clip": 1.02962732, + "balance_loss_mlp": 1.00582957, + "epoch": 0.5033367402152348, + "flos": 16213452721920.0, + "grad_norm": 2.1503483460049857, + "language_loss": 0.74825281, + "learning_rate": 2.0755422760709876e-06, + "loss": 0.77053303, + "num_input_tokens_seen": 90270515, + "step": 4186, + "time_per_iteration": 3.6710126399993896 + }, + { + "auxiliary_loss_clip": 0.01092502, + "auxiliary_loss_mlp": 0.01087047, + "balance_loss_clip": 1.02938128, + "balance_loss_mlp": 1.00636625, + "epoch": 0.5034569831058738, + "flos": 21391375927680.0, + "grad_norm": 1.8802697421189996, + "language_loss": 0.7745136, + "learning_rate": 2.0747638545752417e-06, + "loss": 0.79630905, + "num_input_tokens_seen": 90289075, + "step": 4187, + "time_per_iteration": 2.772102117538452 + }, + { + "auxiliary_loss_clip": 0.01119932, + "auxiliary_loss_mlp": 0.01085588, + "balance_loss_clip": 1.02737617, + "balance_loss_mlp": 1.0050981, + "epoch": 0.503577225996513, + "flos": 20558751690240.0, + "grad_norm": 2.2261988412249742, + "language_loss": 0.83019269, + "learning_rate": 2.073985421737878e-06, + "loss": 0.85224783, + "num_input_tokens_seen": 90306385, + "step": 4188, + "time_per_iteration": 2.7178144454956055 + }, + { + "auxiliary_loss_clip": 0.01132819, + "auxiliary_loss_mlp": 0.0108485, + "balance_loss_clip": 1.02934361, + "balance_loss_mlp": 1.00426459, + "epoch": 0.5036974688871521, + "flos": 27229157930880.0, + "grad_norm": 2.092469525150144, + "language_loss": 0.73799795, + "learning_rate": 2.0732069776769844e-06, + "loss": 0.76017463, + "num_input_tokens_seen": 90323795, + "step": 4189, + "time_per_iteration": 3.6253654956817627 + }, + { + "auxiliary_loss_clip": 0.01142843, + "auxiliary_loss_mlp": 0.0108557, + "balance_loss_clip": 1.03082824, + "balance_loss_mlp": 1.00488865, + "epoch": 0.5038177117777911, + "flos": 20412164286720.0, + "grad_norm": 2.3771410222272404, + "language_loss": 0.73278737, + "learning_rate": 2.072428522510651e-06, + "loss": 0.75507158, + "num_input_tokens_seen": 90340360, + "step": 4190, + "time_per_iteration": 2.678117036819458 + }, + { + "auxiliary_loss_clip": 0.0110597, + "auxiliary_loss_mlp": 0.01085762, + "balance_loss_clip": 1.02587509, + "balance_loss_mlp": 1.00522447, + "epoch": 0.5039379546684303, + "flos": 21907987286400.0, + "grad_norm": 2.449881382757735, + "language_loss": 0.76184571, + "learning_rate": 2.071650056356968e-06, + "loss": 0.78376305, + "num_input_tokens_seen": 90357900, + "step": 4191, + "time_per_iteration": 2.747471570968628 + }, + { + "auxiliary_loss_clip": 0.01142351, + "auxiliary_loss_mlp": 0.01086368, + "balance_loss_clip": 1.02998257, + "balance_loss_mlp": 1.0057348, + "epoch": 0.5040581975590693, + "flos": 20010718909440.0, + "grad_norm": 1.8416232710333673, + "language_loss": 0.79929447, + "learning_rate": 2.070871579334028e-06, + "loss": 0.82158172, + "num_input_tokens_seen": 90377010, + "step": 4192, + "time_per_iteration": 2.6432981491088867 + }, + { + "auxiliary_loss_clip": 0.01140516, + "auxiliary_loss_mlp": 0.01085253, + "balance_loss_clip": 1.02900219, + "balance_loss_mlp": 1.00471556, + "epoch": 0.5041784404497084, + "flos": 20959837931520.0, + "grad_norm": 1.5907999539633693, + "language_loss": 0.71847802, + "learning_rate": 2.0700930915599264e-06, + "loss": 0.74073571, + "num_input_tokens_seen": 90396740, + "step": 4193, + "time_per_iteration": 2.6692097187042236 + }, + { + "auxiliary_loss_clip": 0.01141189, + "auxiliary_loss_mlp": 0.01085143, + "balance_loss_clip": 1.02934289, + "balance_loss_mlp": 1.00465322, + "epoch": 0.5042986833403476, + "flos": 12495082757760.0, + "grad_norm": 2.065098600180069, + "language_loss": 0.78258783, + "learning_rate": 2.0693145931527583e-06, + "loss": 0.80485117, + "num_input_tokens_seen": 90413220, + "step": 4194, + "time_per_iteration": 2.5983669757843018 + }, + { + "auxiliary_loss_clip": 0.01113577, + "auxiliary_loss_mlp": 0.01085175, + "balance_loss_clip": 1.02597988, + "balance_loss_mlp": 1.00458956, + "epoch": 0.5044189262309866, + "flos": 29202305788800.0, + "grad_norm": 1.5216703789393875, + "language_loss": 0.77763164, + "learning_rate": 2.068536084230622e-06, + "loss": 0.79961914, + "num_input_tokens_seen": 90435085, + "step": 4195, + "time_per_iteration": 2.8164966106414795 + }, + { + "auxiliary_loss_clip": 0.01130936, + "auxiliary_loss_mlp": 0.0108484, + "balance_loss_clip": 1.02838135, + "balance_loss_mlp": 1.00435019, + "epoch": 0.5045391691216257, + "flos": 23873198238720.0, + "grad_norm": 2.042370441142953, + "language_loss": 0.89058447, + "learning_rate": 2.067757564911616e-06, + "loss": 0.91274226, + "num_input_tokens_seen": 90453660, + "step": 4196, + "time_per_iteration": 2.6564602851867676 + }, + { + "auxiliary_loss_clip": 0.01124463, + "auxiliary_loss_mlp": 0.00873017, + "balance_loss_clip": 1.02843904, + "balance_loss_mlp": 1.0000912, + "epoch": 0.5046594120122648, + "flos": 24644990793600.0, + "grad_norm": 1.9861738045293393, + "language_loss": 0.92470956, + "learning_rate": 2.0669790353138407e-06, + "loss": 0.94468439, + "num_input_tokens_seen": 90472625, + "step": 4197, + "time_per_iteration": 2.7715463638305664 + }, + { + "auxiliary_loss_clip": 0.01102495, + "auxiliary_loss_mlp": 0.00873092, + "balance_loss_clip": 1.02453947, + "balance_loss_mlp": 1.00012279, + "epoch": 0.5047796549029039, + "flos": 23362835846400.0, + "grad_norm": 2.1345413886642906, + "language_loss": 0.73270428, + "learning_rate": 2.0662004955553995e-06, + "loss": 0.75246012, + "num_input_tokens_seen": 90492325, + "step": 4198, + "time_per_iteration": 2.8132898807525635 + }, + { + "auxiliary_loss_clip": 0.01125096, + "auxiliary_loss_mlp": 0.01086473, + "balance_loss_clip": 1.02925253, + "balance_loss_mlp": 1.00593495, + "epoch": 0.5048998977935429, + "flos": 17304095329920.0, + "grad_norm": 1.9779114848895123, + "language_loss": 0.77017778, + "learning_rate": 2.065421945754395e-06, + "loss": 0.79229343, + "num_input_tokens_seen": 90510055, + "step": 4199, + "time_per_iteration": 2.7234861850738525 + }, + { + "auxiliary_loss_clip": 0.01103762, + "auxiliary_loss_mlp": 0.01084696, + "balance_loss_clip": 1.02666342, + "balance_loss_mlp": 1.00425339, + "epoch": 0.505020140684182, + "flos": 34856979235200.0, + "grad_norm": 1.6945738745759102, + "language_loss": 0.78211951, + "learning_rate": 2.0646433860289344e-06, + "loss": 0.80400407, + "num_input_tokens_seen": 90528980, + "step": 4200, + "time_per_iteration": 2.9041178226470947 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.00873074, + "balance_loss_clip": 1.02744937, + "balance_loss_mlp": 1.00016499, + "epoch": 0.5051403835748212, + "flos": 24863974058880.0, + "grad_norm": 1.8070804098029727, + "language_loss": 0.82212734, + "learning_rate": 2.0638648164971233e-06, + "loss": 0.84217298, + "num_input_tokens_seen": 90547445, + "step": 4201, + "time_per_iteration": 2.7309041023254395 + }, + { + "auxiliary_loss_clip": 0.01115033, + "auxiliary_loss_mlp": 0.01085374, + "balance_loss_clip": 1.0266006, + "balance_loss_mlp": 1.00488353, + "epoch": 0.5052606264654602, + "flos": 20959694277120.0, + "grad_norm": 1.8212070561570153, + "language_loss": 0.88800365, + "learning_rate": 2.06308623727707e-06, + "loss": 0.91000772, + "num_input_tokens_seen": 90567545, + "step": 4202, + "time_per_iteration": 2.7329177856445312 + }, + { + "auxiliary_loss_clip": 0.01125431, + "auxiliary_loss_mlp": 0.01084807, + "balance_loss_clip": 1.0280416, + "balance_loss_mlp": 1.00426936, + "epoch": 0.5053808693560993, + "flos": 19642382893440.0, + "grad_norm": 2.3386717733045805, + "language_loss": 0.76457572, + "learning_rate": 2.0623076484868846e-06, + "loss": 0.78667808, + "num_input_tokens_seen": 90585000, + "step": 4203, + "time_per_iteration": 2.6282434463500977 + }, + { + "auxiliary_loss_clip": 0.01102739, + "auxiliary_loss_mlp": 0.01079742, + "balance_loss_clip": 1.02245641, + "balance_loss_mlp": 1.00039637, + "epoch": 0.5055011122467384, + "flos": 67504915019520.0, + "grad_norm": 0.9249270457522164, + "language_loss": 0.60697395, + "learning_rate": 2.061529050244679e-06, + "loss": 0.62879878, + "num_input_tokens_seen": 90644745, + "step": 4204, + "time_per_iteration": 3.223998546600342 + }, + { + "auxiliary_loss_clip": 0.01099573, + "auxiliary_loss_mlp": 0.0108609, + "balance_loss_clip": 1.02865338, + "balance_loss_mlp": 1.0055995, + "epoch": 0.5056213551373775, + "flos": 16872952383360.0, + "grad_norm": 1.8240898877449911, + "language_loss": 0.74418736, + "learning_rate": 2.060750442668565e-06, + "loss": 0.76604402, + "num_input_tokens_seen": 90662500, + "step": 4205, + "time_per_iteration": 2.7010204792022705 + }, + { + "auxiliary_loss_clip": 0.01132416, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_clip": 1.02971244, + "balance_loss_mlp": 1.0058949, + "epoch": 0.5057415980280165, + "flos": 15334179696000.0, + "grad_norm": 2.1261499436372024, + "language_loss": 0.63498318, + "learning_rate": 2.059971825876657e-06, + "loss": 0.65717071, + "num_input_tokens_seen": 90677010, + "step": 4206, + "time_per_iteration": 2.7575531005859375 + }, + { + "auxiliary_loss_clip": 0.01132819, + "auxiliary_loss_mlp": 0.01084997, + "balance_loss_clip": 1.02893817, + "balance_loss_mlp": 1.00450647, + "epoch": 0.5058618409186557, + "flos": 19025976574080.0, + "grad_norm": 1.8237470921271544, + "language_loss": 0.76485121, + "learning_rate": 2.0591931999870713e-06, + "loss": 0.78702933, + "num_input_tokens_seen": 90695935, + "step": 4207, + "time_per_iteration": 2.6692988872528076 + }, + { + "auxiliary_loss_clip": 0.01110551, + "auxiliary_loss_mlp": 0.0107916, + "balance_loss_clip": 1.02216721, + "balance_loss_mlp": 1.00019574, + "epoch": 0.5059820838092948, + "flos": 63453114080640.0, + "grad_norm": 0.8177441494369929, + "language_loss": 0.57810181, + "learning_rate": 2.0584145651179234e-06, + "loss": 0.59999895, + "num_input_tokens_seen": 90751645, + "step": 4208, + "time_per_iteration": 4.101560592651367 + }, + { + "auxiliary_loss_clip": 0.01120621, + "auxiliary_loss_mlp": 0.00873026, + "balance_loss_clip": 1.02746332, + "balance_loss_mlp": 1.00015318, + "epoch": 0.5061023266999338, + "flos": 15441803821440.0, + "grad_norm": 2.216684821722388, + "language_loss": 0.79916346, + "learning_rate": 2.0576359213873327e-06, + "loss": 0.8190999, + "num_input_tokens_seen": 90766795, + "step": 4209, + "time_per_iteration": 3.7377192974090576 + }, + { + "auxiliary_loss_clip": 0.01123698, + "auxiliary_loss_mlp": 0.01085008, + "balance_loss_clip": 1.02766562, + "balance_loss_mlp": 1.00427938, + "epoch": 0.506222569590573, + "flos": 22451063990400.0, + "grad_norm": 2.053512198748929, + "language_loss": 0.70218998, + "learning_rate": 2.056857268913419e-06, + "loss": 0.72427702, + "num_input_tokens_seen": 90786845, + "step": 4210, + "time_per_iteration": 2.72892427444458 + }, + { + "auxiliary_loss_clip": 0.01132092, + "auxiliary_loss_mlp": 0.0108626, + "balance_loss_clip": 1.02936637, + "balance_loss_mlp": 1.00567484, + "epoch": 0.506342812481212, + "flos": 17558665994880.0, + "grad_norm": 2.321316883904444, + "language_loss": 0.84311545, + "learning_rate": 2.056078607814303e-06, + "loss": 0.86529893, + "num_input_tokens_seen": 90802630, + "step": 4211, + "time_per_iteration": 2.7112622261047363 + }, + { + "auxiliary_loss_clip": 0.01131653, + "auxiliary_loss_mlp": 0.01085002, + "balance_loss_clip": 1.02951097, + "balance_loss_mlp": 1.00446379, + "epoch": 0.5064630553718511, + "flos": 23402050519680.0, + "grad_norm": 1.573477818776084, + "language_loss": 0.78262556, + "learning_rate": 2.055299938208106e-06, + "loss": 0.80479217, + "num_input_tokens_seen": 90823620, + "step": 4212, + "time_per_iteration": 3.7429165840148926 + }, + { + "auxiliary_loss_clip": 0.01132438, + "auxiliary_loss_mlp": 0.01084886, + "balance_loss_clip": 1.02903175, + "balance_loss_mlp": 1.0043962, + "epoch": 0.5065832982624903, + "flos": 23987035416960.0, + "grad_norm": 1.6358195200274566, + "language_loss": 0.86161268, + "learning_rate": 2.0545212602129526e-06, + "loss": 0.8837859, + "num_input_tokens_seen": 90843475, + "step": 4213, + "time_per_iteration": 2.663578987121582 + }, + { + "auxiliary_loss_clip": 0.01116884, + "auxiliary_loss_mlp": 0.01085323, + "balance_loss_clip": 1.02788162, + "balance_loss_mlp": 1.00478506, + "epoch": 0.5067035411531293, + "flos": 21503058289920.0, + "grad_norm": 2.0873219022167855, + "language_loss": 0.66162121, + "learning_rate": 2.0537425739469673e-06, + "loss": 0.68364322, + "num_input_tokens_seen": 90862410, + "step": 4214, + "time_per_iteration": 2.7311253547668457 + }, + { + "auxiliary_loss_clip": 0.01115726, + "auxiliary_loss_mlp": 0.01079251, + "balance_loss_clip": 1.02705908, + "balance_loss_mlp": 0.99990505, + "epoch": 0.5068237840437684, + "flos": 65934397687680.0, + "grad_norm": 0.8481290520888525, + "language_loss": 0.59566879, + "learning_rate": 2.052963879528276e-06, + "loss": 0.61761856, + "num_input_tokens_seen": 90922280, + "step": 4215, + "time_per_iteration": 4.140263557434082 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01084298, + "balance_loss_clip": 1.02831292, + "balance_loss_mlp": 1.00375986, + "epoch": 0.5069440269344075, + "flos": 27264206626560.0, + "grad_norm": 1.9684546866912256, + "language_loss": 0.76449609, + "learning_rate": 2.052185177075007e-06, + "loss": 0.78664565, + "num_input_tokens_seen": 90941850, + "step": 4216, + "time_per_iteration": 2.7094123363494873 + }, + { + "auxiliary_loss_clip": 0.01133204, + "auxiliary_loss_mlp": 0.01085766, + "balance_loss_clip": 1.02922499, + "balance_loss_mlp": 1.00498974, + "epoch": 0.5070642698250466, + "flos": 23366319465600.0, + "grad_norm": 1.6089270162929437, + "language_loss": 0.82730675, + "learning_rate": 2.051406466705288e-06, + "loss": 0.84949636, + "num_input_tokens_seen": 90961390, + "step": 4217, + "time_per_iteration": 2.747345447540283 + }, + { + "auxiliary_loss_clip": 0.01140585, + "auxiliary_loss_mlp": 0.01085269, + "balance_loss_clip": 1.028404, + "balance_loss_mlp": 1.00482678, + "epoch": 0.5071845127156857, + "flos": 20340127560960.0, + "grad_norm": 1.8675865006671597, + "language_loss": 0.80964655, + "learning_rate": 2.0506277485372486e-06, + "loss": 0.83190507, + "num_input_tokens_seen": 90980215, + "step": 4218, + "time_per_iteration": 2.6112308502197266 + }, + { + "auxiliary_loss_clip": 0.01133994, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_clip": 1.02936959, + "balance_loss_mlp": 1.00554156, + "epoch": 0.5073047556063248, + "flos": 12092955022080.0, + "grad_norm": 2.12588046421018, + "language_loss": 0.66876376, + "learning_rate": 2.04984902268902e-06, + "loss": 0.69096446, + "num_input_tokens_seen": 90997415, + "step": 4219, + "time_per_iteration": 2.6449830532073975 + }, + { + "auxiliary_loss_clip": 0.01132601, + "auxiliary_loss_mlp": 0.01085837, + "balance_loss_clip": 1.02802324, + "balance_loss_mlp": 1.0051558, + "epoch": 0.5074249984969639, + "flos": 19682854542720.0, + "grad_norm": 2.1325185935984137, + "language_loss": 0.75155467, + "learning_rate": 2.0490702892787345e-06, + "loss": 0.77373898, + "num_input_tokens_seen": 91016475, + "step": 4220, + "time_per_iteration": 2.58585262298584 + }, + { + "auxiliary_loss_clip": 0.01132764, + "auxiliary_loss_mlp": 0.01084587, + "balance_loss_clip": 1.0285821, + "balance_loss_mlp": 1.00404906, + "epoch": 0.5075452413876029, + "flos": 28765703975040.0, + "grad_norm": 1.829585744357122, + "language_loss": 0.6225729, + "learning_rate": 2.0482915484245246e-06, + "loss": 0.64474642, + "num_input_tokens_seen": 91038095, + "step": 4221, + "time_per_iteration": 2.6370961666107178 + }, + { + "auxiliary_loss_clip": 0.01098988, + "auxiliary_loss_mlp": 0.01085392, + "balance_loss_clip": 1.02644646, + "balance_loss_mlp": 1.00475907, + "epoch": 0.5076654842782421, + "flos": 20339445202560.0, + "grad_norm": 2.3910564868776016, + "language_loss": 0.84399015, + "learning_rate": 2.047512800244526e-06, + "loss": 0.86583394, + "num_input_tokens_seen": 91053360, + "step": 4222, + "time_per_iteration": 2.662569522857666 + }, + { + "auxiliary_loss_clip": 0.01131152, + "auxiliary_loss_mlp": 0.0108692, + "balance_loss_clip": 1.0289855, + "balance_loss_mlp": 1.00633478, + "epoch": 0.5077857271688812, + "flos": 26359653404160.0, + "grad_norm": 1.996836248748491, + "language_loss": 0.78777218, + "learning_rate": 2.046734044856873e-06, + "loss": 0.80995297, + "num_input_tokens_seen": 91072770, + "step": 4223, + "time_per_iteration": 2.5670695304870605 + }, + { + "auxiliary_loss_clip": 0.01131549, + "auxiliary_loss_mlp": 0.0108605, + "balance_loss_clip": 1.02874851, + "balance_loss_mlp": 1.00546432, + "epoch": 0.5079059700595202, + "flos": 21798962530560.0, + "grad_norm": 1.8575978000499198, + "language_loss": 0.81641775, + "learning_rate": 2.045955282379702e-06, + "loss": 0.83859372, + "num_input_tokens_seen": 91091430, + "step": 4224, + "time_per_iteration": 2.5935921669006348 + }, + { + "auxiliary_loss_clip": 0.01133898, + "auxiliary_loss_mlp": 0.01086523, + "balance_loss_clip": 1.02987266, + "balance_loss_mlp": 1.00588942, + "epoch": 0.5080262129501594, + "flos": 13187943175680.0, + "grad_norm": 3.534453562055388, + "language_loss": 0.75687867, + "learning_rate": 2.045176512931152e-06, + "loss": 0.77908289, + "num_input_tokens_seen": 91106060, + "step": 4225, + "time_per_iteration": 2.667754650115967 + }, + { + "auxiliary_loss_clip": 0.01096972, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_clip": 1.02744877, + "balance_loss_mlp": 1.00479794, + "epoch": 0.5081464558407984, + "flos": 25301473712640.0, + "grad_norm": 1.8982771536098724, + "language_loss": 0.75756729, + "learning_rate": 2.0443977366293604e-06, + "loss": 0.7793889, + "num_input_tokens_seen": 91124100, + "step": 4226, + "time_per_iteration": 2.747760534286499 + }, + { + "auxiliary_loss_clip": 0.01097171, + "auxiliary_loss_mlp": 0.01086377, + "balance_loss_clip": 1.02674603, + "balance_loss_mlp": 1.00564861, + "epoch": 0.5082666987314375, + "flos": 30951226995840.0, + "grad_norm": 1.4475625065778832, + "language_loss": 0.76889324, + "learning_rate": 2.043618953592468e-06, + "loss": 0.79072869, + "num_input_tokens_seen": 91146555, + "step": 4227, + "time_per_iteration": 2.9832026958465576 + }, + { + "auxiliary_loss_clip": 0.01120655, + "auxiliary_loss_mlp": 0.01086054, + "balance_loss_clip": 1.02745914, + "balance_loss_mlp": 1.00537348, + "epoch": 0.5083869416220766, + "flos": 19682495406720.0, + "grad_norm": 1.6164388731365833, + "language_loss": 0.81053358, + "learning_rate": 2.0428401639386144e-06, + "loss": 0.83260071, + "num_input_tokens_seen": 91167120, + "step": 4228, + "time_per_iteration": 2.767404556274414 + }, + { + "auxiliary_loss_clip": 0.01107086, + "auxiliary_loss_mlp": 0.01079128, + "balance_loss_clip": 1.02736044, + "balance_loss_mlp": 1.00016367, + "epoch": 0.5085071845127157, + "flos": 71817535589760.0, + "grad_norm": 0.8207335114504344, + "language_loss": 0.58174503, + "learning_rate": 2.042061367785943e-06, + "loss": 0.60360718, + "num_input_tokens_seen": 91220260, + "step": 4229, + "time_per_iteration": 3.282897472381592 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01085634, + "balance_loss_clip": 1.02620387, + "balance_loss_mlp": 1.00495362, + "epoch": 0.5086274274033548, + "flos": 35951608252800.0, + "grad_norm": 2.286563634917066, + "language_loss": 0.75093377, + "learning_rate": 2.041282565252594e-06, + "loss": 0.77291423, + "num_input_tokens_seen": 91240425, + "step": 4230, + "time_per_iteration": 2.8357839584350586 + }, + { + "auxiliary_loss_clip": 0.01115277, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_clip": 1.02801144, + "balance_loss_mlp": 1.00488186, + "epoch": 0.5087476702939938, + "flos": 23513732881920.0, + "grad_norm": 1.5671860345059319, + "language_loss": 0.76916349, + "learning_rate": 2.040503756456714e-06, + "loss": 0.79116952, + "num_input_tokens_seen": 91259635, + "step": 4231, + "time_per_iteration": 2.717071056365967 + }, + { + "auxiliary_loss_clip": 0.01133079, + "auxiliary_loss_mlp": 0.0108429, + "balance_loss_clip": 1.02890003, + "balance_loss_mlp": 1.00380015, + "epoch": 0.508867913184633, + "flos": 15122091841920.0, + "grad_norm": 2.0526651355342715, + "language_loss": 0.78799748, + "learning_rate": 2.0397249415164456e-06, + "loss": 0.81017119, + "num_input_tokens_seen": 91276990, + "step": 4232, + "time_per_iteration": 2.57814621925354 + }, + { + "auxiliary_loss_clip": 0.01123954, + "auxiliary_loss_mlp": 0.01084072, + "balance_loss_clip": 1.0286212, + "balance_loss_mlp": 1.00353408, + "epoch": 0.508988156075272, + "flos": 25885309374720.0, + "grad_norm": 1.5490152584152779, + "language_loss": 0.80205733, + "learning_rate": 2.0389461205499354e-06, + "loss": 0.82413763, + "num_input_tokens_seen": 91296125, + "step": 4233, + "time_per_iteration": 3.6804988384246826 + }, + { + "auxiliary_loss_clip": 0.01113366, + "auxiliary_loss_mlp": 0.01085317, + "balance_loss_clip": 1.02725029, + "balance_loss_mlp": 1.0047313, + "epoch": 0.5091083989659111, + "flos": 13844857057920.0, + "grad_norm": 1.9943962220645117, + "language_loss": 0.73743063, + "learning_rate": 2.03816729367533e-06, + "loss": 0.75941741, + "num_input_tokens_seen": 91314280, + "step": 4234, + "time_per_iteration": 3.621516227722168 + }, + { + "auxiliary_loss_clip": 0.01118333, + "auxiliary_loss_mlp": 0.01085468, + "balance_loss_clip": 1.02498794, + "balance_loss_mlp": 1.00497806, + "epoch": 0.5092286418565503, + "flos": 21104881050240.0, + "grad_norm": 1.9307305546084699, + "language_loss": 0.71467859, + "learning_rate": 2.0373884610107765e-06, + "loss": 0.73671657, + "num_input_tokens_seen": 91334595, + "step": 4235, + "time_per_iteration": 2.769803285598755 + }, + { + "auxiliary_loss_clip": 0.01132366, + "auxiliary_loss_mlp": 0.01085179, + "balance_loss_clip": 1.02822924, + "balance_loss_mlp": 1.00459313, + "epoch": 0.5093488847471893, + "flos": 18621298972800.0, + "grad_norm": 4.598701699292596, + "language_loss": 0.69737637, + "learning_rate": 2.0366096226744225e-06, + "loss": 0.7195518, + "num_input_tokens_seen": 91349790, + "step": 4236, + "time_per_iteration": 2.715473175048828 + }, + { + "auxiliary_loss_clip": 0.01133022, + "auxiliary_loss_mlp": 0.01085154, + "balance_loss_clip": 1.02944398, + "balance_loss_mlp": 1.0046165, + "epoch": 0.5094691276378284, + "flos": 23803783205760.0, + "grad_norm": 1.9435006940185724, + "language_loss": 0.7669307, + "learning_rate": 2.035830778784418e-06, + "loss": 0.78911245, + "num_input_tokens_seen": 91370465, + "step": 4237, + "time_per_iteration": 3.568228244781494 + }, + { + "auxiliary_loss_clip": 0.01108596, + "auxiliary_loss_mlp": 0.01085327, + "balance_loss_clip": 1.03038764, + "balance_loss_mlp": 1.00478888, + "epoch": 0.5095893705284675, + "flos": 17420410546560.0, + "grad_norm": 1.7741522550784146, + "language_loss": 0.80350482, + "learning_rate": 2.0350519294589134e-06, + "loss": 0.82544404, + "num_input_tokens_seen": 91388505, + "step": 4238, + "time_per_iteration": 2.787616491317749 + }, + { + "auxiliary_loss_clip": 0.01094198, + "auxiliary_loss_mlp": 0.01085273, + "balance_loss_clip": 1.02294767, + "balance_loss_mlp": 1.00463986, + "epoch": 0.5097096134191066, + "flos": 25849362839040.0, + "grad_norm": 2.0790954860293973, + "language_loss": 0.82856858, + "learning_rate": 2.0342730748160588e-06, + "loss": 0.85036325, + "num_input_tokens_seen": 91408970, + "step": 4239, + "time_per_iteration": 2.8291308879852295 + }, + { + "auxiliary_loss_clip": 0.01123069, + "auxiliary_loss_mlp": 0.01085776, + "balance_loss_clip": 1.02834129, + "balance_loss_mlp": 1.00519061, + "epoch": 0.5098298563097456, + "flos": 27745122844800.0, + "grad_norm": 2.1210342735994776, + "language_loss": 0.70280868, + "learning_rate": 2.033494214974006e-06, + "loss": 0.72489715, + "num_input_tokens_seen": 91430115, + "step": 4240, + "time_per_iteration": 3.659254312515259 + }, + { + "auxiliary_loss_clip": 0.01122003, + "auxiliary_loss_mlp": 0.01084167, + "balance_loss_clip": 1.02776885, + "balance_loss_mlp": 1.00372493, + "epoch": 0.5099500992003848, + "flos": 21358913011200.0, + "grad_norm": 1.6297362018411625, + "language_loss": 0.83989769, + "learning_rate": 2.0327153500509067e-06, + "loss": 0.86195934, + "num_input_tokens_seen": 91449140, + "step": 4241, + "time_per_iteration": 2.7335734367370605 + }, + { + "auxiliary_loss_clip": 0.01121603, + "auxiliary_loss_mlp": 0.01084688, + "balance_loss_clip": 1.02767372, + "balance_loss_mlp": 1.00410271, + "epoch": 0.5100703420910239, + "flos": 19865999013120.0, + "grad_norm": 3.485053977923276, + "language_loss": 0.84787512, + "learning_rate": 2.031936480164916e-06, + "loss": 0.86993802, + "num_input_tokens_seen": 91466880, + "step": 4242, + "time_per_iteration": 2.6662285327911377 + }, + { + "auxiliary_loss_clip": 0.0111502, + "auxiliary_loss_mlp": 0.01085773, + "balance_loss_clip": 1.0275557, + "balance_loss_mlp": 1.0052352, + "epoch": 0.5101905849816629, + "flos": 24648797635200.0, + "grad_norm": 5.09167983549448, + "language_loss": 0.80314863, + "learning_rate": 2.0311576054341857e-06, + "loss": 0.82515657, + "num_input_tokens_seen": 91487495, + "step": 4243, + "time_per_iteration": 2.7525525093078613 + }, + { + "auxiliary_loss_clip": 0.0114358, + "auxiliary_loss_mlp": 0.01084623, + "balance_loss_clip": 1.03132689, + "balance_loss_mlp": 1.00403738, + "epoch": 0.5103108278723021, + "flos": 22930076787840.0, + "grad_norm": 1.5889515117426518, + "language_loss": 0.62685406, + "learning_rate": 2.0303787259768715e-06, + "loss": 0.64913613, + "num_input_tokens_seen": 91508395, + "step": 4244, + "time_per_iteration": 2.666621446609497 + }, + { + "auxiliary_loss_clip": 0.01122366, + "auxiliary_loss_mlp": 0.01085414, + "balance_loss_clip": 1.02921605, + "balance_loss_mlp": 1.00482869, + "epoch": 0.5104310707629411, + "flos": 21506613736320.0, + "grad_norm": 2.4518768448995916, + "language_loss": 0.69123614, + "learning_rate": 2.0295998419111294e-06, + "loss": 0.71331388, + "num_input_tokens_seen": 91525685, + "step": 4245, + "time_per_iteration": 2.6962358951568604 + }, + { + "auxiliary_loss_clip": 0.01096486, + "auxiliary_loss_mlp": 0.01084819, + "balance_loss_clip": 1.0266242, + "balance_loss_mlp": 1.00428152, + "epoch": 0.5105513136535802, + "flos": 14903180403840.0, + "grad_norm": 2.3630351959855234, + "language_loss": 0.73863363, + "learning_rate": 2.028820953355115e-06, + "loss": 0.76044667, + "num_input_tokens_seen": 91543785, + "step": 4246, + "time_per_iteration": 2.752746105194092 + }, + { + "auxiliary_loss_clip": 0.01123624, + "auxiliary_loss_mlp": 0.01085149, + "balance_loss_clip": 1.02776384, + "balance_loss_mlp": 1.00456369, + "epoch": 0.5106715565442194, + "flos": 22602212421120.0, + "grad_norm": 3.5231343960332624, + "language_loss": 0.7878139, + "learning_rate": 2.0280420604269834e-06, + "loss": 0.80990165, + "num_input_tokens_seen": 91563325, + "step": 4247, + "time_per_iteration": 2.7677605152130127 + }, + { + "auxiliary_loss_clip": 0.01117072, + "auxiliary_loss_mlp": 0.01079651, + "balance_loss_clip": 1.02874267, + "balance_loss_mlp": 1.00030506, + "epoch": 0.5107917994348584, + "flos": 71027645558400.0, + "grad_norm": 0.7207545721370784, + "language_loss": 0.58925426, + "learning_rate": 2.027263163244895e-06, + "loss": 0.61122155, + "num_input_tokens_seen": 91632450, + "step": 4248, + "time_per_iteration": 3.3802239894866943 + }, + { + "auxiliary_loss_clip": 0.01132135, + "auxiliary_loss_mlp": 0.01085608, + "balance_loss_clip": 1.02979612, + "balance_loss_mlp": 1.00516582, + "epoch": 0.5109120423254975, + "flos": 24827416992000.0, + "grad_norm": 1.5601474903461567, + "language_loss": 0.74414337, + "learning_rate": 2.026484261927005e-06, + "loss": 0.76632077, + "num_input_tokens_seen": 91651945, + "step": 4249, + "time_per_iteration": 2.739809274673462 + }, + { + "auxiliary_loss_clip": 0.01134024, + "auxiliary_loss_mlp": 0.01085433, + "balance_loss_clip": 1.02996969, + "balance_loss_mlp": 1.00489473, + "epoch": 0.5110322852161366, + "flos": 21247661612160.0, + "grad_norm": 2.2673704485637516, + "language_loss": 0.74057508, + "learning_rate": 2.025705356591475e-06, + "loss": 0.7627697, + "num_input_tokens_seen": 91669635, + "step": 4250, + "time_per_iteration": 2.7810206413269043 + }, + { + "auxiliary_loss_clip": 0.01099532, + "auxiliary_loss_mlp": 0.0087309, + "balance_loss_clip": 1.02776527, + "balance_loss_mlp": 1.00181723, + "epoch": 0.5111525281067757, + "flos": 66457114358400.0, + "grad_norm": 0.7936335219744992, + "language_loss": 0.57967168, + "learning_rate": 2.024926447356462e-06, + "loss": 0.5993979, + "num_input_tokens_seen": 91731920, + "step": 4251, + "time_per_iteration": 3.215733051300049 + }, + { + "auxiliary_loss_clip": 0.01125705, + "auxiliary_loss_mlp": 0.01085873, + "balance_loss_clip": 1.02839339, + "balance_loss_mlp": 1.00523984, + "epoch": 0.5112727709974147, + "flos": 14866731077760.0, + "grad_norm": 1.924656889340185, + "language_loss": 0.7870568, + "learning_rate": 2.024147534340127e-06, + "loss": 0.80917263, + "num_input_tokens_seen": 91749780, + "step": 4252, + "time_per_iteration": 2.6766366958618164 + }, + { + "auxiliary_loss_clip": 0.01107473, + "auxiliary_loss_mlp": 0.01084957, + "balance_loss_clip": 1.02754211, + "balance_loss_mlp": 1.00437152, + "epoch": 0.5113930138880539, + "flos": 21177600134400.0, + "grad_norm": 1.693471942139995, + "language_loss": 0.79814279, + "learning_rate": 2.02336861766063e-06, + "loss": 0.82006705, + "num_input_tokens_seen": 91768840, + "step": 4253, + "time_per_iteration": 2.716398239135742 + }, + { + "auxiliary_loss_clip": 0.01134271, + "auxiliary_loss_mlp": 0.0108491, + "balance_loss_clip": 1.0299536, + "balance_loss_mlp": 1.00427711, + "epoch": 0.511513256778693, + "flos": 20409111630720.0, + "grad_norm": 1.5453398301270178, + "language_loss": 0.78722721, + "learning_rate": 2.0225896974361327e-06, + "loss": 0.80941904, + "num_input_tokens_seen": 91788945, + "step": 4254, + "time_per_iteration": 2.6757969856262207 + }, + { + "auxiliary_loss_clip": 0.0110203, + "auxiliary_loss_mlp": 0.01078961, + "balance_loss_clip": 1.03053665, + "balance_loss_mlp": 0.99999678, + "epoch": 0.511633499669332, + "flos": 69879975131520.0, + "grad_norm": 0.8502605502001167, + "language_loss": 0.59926045, + "learning_rate": 2.0218107737847962e-06, + "loss": 0.62107038, + "num_input_tokens_seen": 91850990, + "step": 4255, + "time_per_iteration": 3.3452227115631104 + }, + { + "auxiliary_loss_clip": 0.01142768, + "auxiliary_loss_mlp": 0.01084276, + "balance_loss_clip": 1.03108871, + "balance_loss_mlp": 1.00378585, + "epoch": 0.5117537425599712, + "flos": 24097855852800.0, + "grad_norm": 1.7582444211390442, + "language_loss": 0.74354708, + "learning_rate": 2.0210318468247826e-06, + "loss": 0.76581752, + "num_input_tokens_seen": 91869960, + "step": 4256, + "time_per_iteration": 2.69547438621521 + }, + { + "auxiliary_loss_clip": 0.01122019, + "auxiliary_loss_mlp": 0.01083285, + "balance_loss_clip": 1.02814865, + "balance_loss_mlp": 1.00284278, + "epoch": 0.5118739854506102, + "flos": 20959550622720.0, + "grad_norm": 1.6960714170263989, + "language_loss": 0.81501287, + "learning_rate": 2.020252916674255e-06, + "loss": 0.83706594, + "num_input_tokens_seen": 91889075, + "step": 4257, + "time_per_iteration": 2.7575955390930176 + }, + { + "auxiliary_loss_clip": 0.01133834, + "auxiliary_loss_mlp": 0.01086779, + "balance_loss_clip": 1.02989674, + "balance_loss_mlp": 1.00624096, + "epoch": 0.5119942283412493, + "flos": 17457326749440.0, + "grad_norm": 1.6448336313244247, + "language_loss": 0.81165582, + "learning_rate": 2.019473983451375e-06, + "loss": 0.83386201, + "num_input_tokens_seen": 91907495, + "step": 4258, + "time_per_iteration": 2.637908697128296 + }, + { + "auxiliary_loss_clip": 0.01098275, + "auxiliary_loss_mlp": 0.01087409, + "balance_loss_clip": 1.0274291, + "balance_loss_mlp": 1.00677562, + "epoch": 0.5121144712318885, + "flos": 21066743784960.0, + "grad_norm": 1.74025629602126, + "language_loss": 0.71481848, + "learning_rate": 2.0186950472743076e-06, + "loss": 0.73667526, + "num_input_tokens_seen": 91927400, + "step": 4259, + "time_per_iteration": 4.564378976821899 + }, + { + "auxiliary_loss_clip": 0.01141198, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_clip": 1.02897835, + "balance_loss_mlp": 1.00551653, + "epoch": 0.5122347141225275, + "flos": 19860791541120.0, + "grad_norm": 1.5542646940373348, + "language_loss": 0.74111271, + "learning_rate": 2.0179161082612162e-06, + "loss": 0.76338428, + "num_input_tokens_seen": 91946790, + "step": 4260, + "time_per_iteration": 2.6967642307281494 + }, + { + "auxiliary_loss_clip": 0.01121845, + "auxiliary_loss_mlp": 0.01085114, + "balance_loss_clip": 1.02744126, + "balance_loss_mlp": 1.004529, + "epoch": 0.5123549570131666, + "flos": 22528487756160.0, + "grad_norm": 2.355936838996073, + "language_loss": 0.72608012, + "learning_rate": 2.017137166530266e-06, + "loss": 0.74814975, + "num_input_tokens_seen": 91966325, + "step": 4261, + "time_per_iteration": 2.729825496673584 + }, + { + "auxiliary_loss_clip": 0.01124785, + "auxiliary_loss_mlp": 0.0108528, + "balance_loss_clip": 1.02957296, + "balance_loss_mlp": 1.00479007, + "epoch": 0.5124751999038056, + "flos": 20333375804160.0, + "grad_norm": 1.9651543119990753, + "language_loss": 0.80124259, + "learning_rate": 2.0163582221996213e-06, + "loss": 0.82334322, + "num_input_tokens_seen": 91984700, + "step": 4262, + "time_per_iteration": 2.8155949115753174 + }, + { + "auxiliary_loss_clip": 0.01121447, + "auxiliary_loss_mlp": 0.01084511, + "balance_loss_clip": 1.02724898, + "balance_loss_mlp": 1.00397336, + "epoch": 0.5125954427944448, + "flos": 39785970211200.0, + "grad_norm": 3.6355107842339858, + "language_loss": 0.68477798, + "learning_rate": 2.015579275387446e-06, + "loss": 0.70683753, + "num_input_tokens_seen": 92010020, + "step": 4263, + "time_per_iteration": 3.87837290763855 + }, + { + "auxiliary_loss_clip": 0.01123444, + "auxiliary_loss_mlp": 0.0108562, + "balance_loss_clip": 1.02895093, + "balance_loss_mlp": 1.004987, + "epoch": 0.5127156856850839, + "flos": 29205394358400.0, + "grad_norm": 3.011192096666626, + "language_loss": 0.68762875, + "learning_rate": 2.0148003262119085e-06, + "loss": 0.70971942, + "num_input_tokens_seen": 92030990, + "step": 4264, + "time_per_iteration": 2.772552490234375 + }, + { + "auxiliary_loss_clip": 0.01104757, + "auxiliary_loss_mlp": 0.01085745, + "balance_loss_clip": 1.02544332, + "balance_loss_mlp": 1.00506401, + "epoch": 0.5128359285757229, + "flos": 13553693412480.0, + "grad_norm": 1.7483911070909939, + "language_loss": 0.76463312, + "learning_rate": 2.0140213747911728e-06, + "loss": 0.78653812, + "num_input_tokens_seen": 92049525, + "step": 4265, + "time_per_iteration": 2.76046085357666 + }, + { + "auxiliary_loss_clip": 0.01110551, + "auxiliary_loss_mlp": 0.01085518, + "balance_loss_clip": 1.0266304, + "balance_loss_mlp": 1.0049808, + "epoch": 0.5129561714663621, + "flos": 25192089820800.0, + "grad_norm": 2.343189338841662, + "language_loss": 0.80628693, + "learning_rate": 2.013242421243406e-06, + "loss": 0.82824767, + "num_input_tokens_seen": 92068430, + "step": 4266, + "time_per_iteration": 3.6945550441741943 + }, + { + "auxiliary_loss_clip": 0.01101845, + "auxiliary_loss_mlp": 0.01084245, + "balance_loss_clip": 1.02656007, + "balance_loss_mlp": 1.00389802, + "epoch": 0.5130764143570011, + "flos": 18150223080960.0, + "grad_norm": 1.4891278246293256, + "language_loss": 0.79023612, + "learning_rate": 2.012463465686774e-06, + "loss": 0.81209695, + "num_input_tokens_seen": 92088180, + "step": 4267, + "time_per_iteration": 2.813819169998169 + }, + { + "auxiliary_loss_clip": 0.01065804, + "auxiliary_loss_mlp": 0.0107909, + "balance_loss_clip": 1.01868486, + "balance_loss_mlp": 1.00012541, + "epoch": 0.5131966572476402, + "flos": 59794896418560.0, + "grad_norm": 0.759555350316439, + "language_loss": 0.54804027, + "learning_rate": 2.0116845082394446e-06, + "loss": 0.56948918, + "num_input_tokens_seen": 92153015, + "step": 4268, + "time_per_iteration": 3.3434324264526367 + }, + { + "auxiliary_loss_clip": 0.01134378, + "auxiliary_loss_mlp": 0.01084552, + "balance_loss_clip": 1.02965391, + "balance_loss_mlp": 1.00396609, + "epoch": 0.5133169001382794, + "flos": 18515219132160.0, + "grad_norm": 1.8983167691004148, + "language_loss": 0.78879136, + "learning_rate": 2.0109055490195836e-06, + "loss": 0.81098068, + "num_input_tokens_seen": 92171470, + "step": 4269, + "time_per_iteration": 2.644163131713867 + }, + { + "auxiliary_loss_clip": 0.01095524, + "auxiliary_loss_mlp": 0.0108519, + "balance_loss_clip": 1.02536023, + "balance_loss_mlp": 1.00460446, + "epoch": 0.5134371430289184, + "flos": 15523537219200.0, + "grad_norm": 2.0186447522854496, + "language_loss": 0.6414842, + "learning_rate": 2.0101265881453605e-06, + "loss": 0.66329134, + "num_input_tokens_seen": 92189945, + "step": 4270, + "time_per_iteration": 2.825410842895508 + }, + { + "auxiliary_loss_clip": 0.01123781, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_clip": 1.02967346, + "balance_loss_mlp": 1.00607729, + "epoch": 0.5135573859195575, + "flos": 21433786911360.0, + "grad_norm": 1.9947579198696912, + "language_loss": 0.78261131, + "learning_rate": 2.009347625734941e-06, + "loss": 0.80471385, + "num_input_tokens_seen": 92209855, + "step": 4271, + "time_per_iteration": 2.7503323554992676 + }, + { + "auxiliary_loss_clip": 0.01143945, + "auxiliary_loss_mlp": 0.01087303, + "balance_loss_clip": 1.0314796, + "balance_loss_mlp": 1.00681329, + "epoch": 0.5136776288101966, + "flos": 17712651600000.0, + "grad_norm": 4.288092327435065, + "language_loss": 0.75042039, + "learning_rate": 2.0085686619064954e-06, + "loss": 0.77273285, + "num_input_tokens_seen": 92226295, + "step": 4272, + "time_per_iteration": 2.667026996612549 + }, + { + "auxiliary_loss_clip": 0.01135004, + "auxiliary_loss_mlp": 0.01085565, + "balance_loss_clip": 1.03126526, + "balance_loss_mlp": 1.00502753, + "epoch": 0.5137978717008357, + "flos": 16581680997120.0, + "grad_norm": 1.8899467816354873, + "language_loss": 0.83111882, + "learning_rate": 2.00778969677819e-06, + "loss": 0.85332453, + "num_input_tokens_seen": 92243330, + "step": 4273, + "time_per_iteration": 2.6403019428253174 + }, + { + "auxiliary_loss_clip": 0.01122603, + "auxiliary_loss_mlp": 0.01085092, + "balance_loss_clip": 1.02852988, + "balance_loss_mlp": 1.00450706, + "epoch": 0.5139181145914747, + "flos": 20668243322880.0, + "grad_norm": 1.6220292883091303, + "language_loss": 0.64248288, + "learning_rate": 2.0070107304681934e-06, + "loss": 0.66455984, + "num_input_tokens_seen": 92262285, + "step": 4274, + "time_per_iteration": 2.795283317565918 + }, + { + "auxiliary_loss_clip": 0.01110983, + "auxiliary_loss_mlp": 0.01083795, + "balance_loss_clip": 1.02716124, + "balance_loss_mlp": 1.0033524, + "epoch": 0.5140383574821139, + "flos": 32926996546560.0, + "grad_norm": 1.6453822522827786, + "language_loss": 0.78099573, + "learning_rate": 2.006231763094675e-06, + "loss": 0.80294347, + "num_input_tokens_seen": 92283305, + "step": 4275, + "time_per_iteration": 2.86568546295166 + }, + { + "auxiliary_loss_clip": 0.01122341, + "auxiliary_loss_mlp": 0.01084422, + "balance_loss_clip": 1.02954316, + "balance_loss_mlp": 1.00388432, + "epoch": 0.514158600372753, + "flos": 19537093152000.0, + "grad_norm": 1.8450907761144353, + "language_loss": 0.87541997, + "learning_rate": 2.0054527947758027e-06, + "loss": 0.89748764, + "num_input_tokens_seen": 92302105, + "step": 4276, + "time_per_iteration": 2.728649377822876 + }, + { + "auxiliary_loss_clip": 0.01116124, + "auxiliary_loss_mlp": 0.0107915, + "balance_loss_clip": 1.02791846, + "balance_loss_mlp": 1.0001862, + "epoch": 0.514278843263392, + "flos": 62523855279360.0, + "grad_norm": 0.7441083206704755, + "language_loss": 0.55934715, + "learning_rate": 2.004673825629746e-06, + "loss": 0.5812999, + "num_input_tokens_seen": 92362885, + "step": 4277, + "time_per_iteration": 3.22799015045166 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.01085254, + "balance_loss_clip": 1.02806735, + "balance_loss_mlp": 1.00471663, + "epoch": 0.5143990861540312, + "flos": 25882328545920.0, + "grad_norm": 2.1153023285762744, + "language_loss": 0.72156876, + "learning_rate": 2.0038948557746744e-06, + "loss": 0.74366033, + "num_input_tokens_seen": 92384740, + "step": 4278, + "time_per_iteration": 2.8395698070526123 + }, + { + "auxiliary_loss_clip": 0.01129977, + "auxiliary_loss_mlp": 0.01085235, + "balance_loss_clip": 1.02792645, + "balance_loss_mlp": 1.00469732, + "epoch": 0.5145193290446702, + "flos": 23330660238720.0, + "grad_norm": 1.5139049061206702, + "language_loss": 0.75149471, + "learning_rate": 2.0031158853287558e-06, + "loss": 0.77364683, + "num_input_tokens_seen": 92405175, + "step": 4279, + "time_per_iteration": 2.6713359355926514 + }, + { + "auxiliary_loss_clip": 0.01124318, + "auxiliary_loss_mlp": 0.01084757, + "balance_loss_clip": 1.0303452, + "balance_loss_mlp": 1.00417125, + "epoch": 0.5146395719353093, + "flos": 22856603518080.0, + "grad_norm": 2.091419492858016, + "language_loss": 0.70192325, + "learning_rate": 2.0023369144101593e-06, + "loss": 0.72401398, + "num_input_tokens_seen": 92423345, + "step": 4280, + "time_per_iteration": 2.700852632522583 + }, + { + "auxiliary_loss_clip": 0.01121053, + "auxiliary_loss_mlp": 0.01085189, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.00460339, + "epoch": 0.5147598148259485, + "flos": 26391577616640.0, + "grad_norm": 2.0454220844687825, + "language_loss": 0.76564443, + "learning_rate": 2.0015579431370555e-06, + "loss": 0.78770685, + "num_input_tokens_seen": 92445025, + "step": 4281, + "time_per_iteration": 2.77567458152771 + }, + { + "auxiliary_loss_clip": 0.01131463, + "auxiliary_loss_mlp": 0.01085613, + "balance_loss_clip": 1.02935243, + "balance_loss_mlp": 1.00512314, + "epoch": 0.5148800577165875, + "flos": 29965694561280.0, + "grad_norm": 2.329758736201874, + "language_loss": 0.70030081, + "learning_rate": 2.000778971627612e-06, + "loss": 0.72247159, + "num_input_tokens_seen": 92464490, + "step": 4282, + "time_per_iteration": 2.709550142288208 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01086086, + "balance_loss_clip": 1.02981281, + "balance_loss_mlp": 1.0055486, + "epoch": 0.5150003006072266, + "flos": 17931383470080.0, + "grad_norm": 1.993782068048364, + "language_loss": 0.90359634, + "learning_rate": 2e-06, + "loss": 0.92571574, + "num_input_tokens_seen": 92482085, + "step": 4283, + "time_per_iteration": 2.7620108127593994 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01085747, + "balance_loss_clip": 1.02996933, + "balance_loss_mlp": 1.00516152, + "epoch": 0.5151205434978657, + "flos": 18478733892480.0, + "grad_norm": 1.6850512435422906, + "language_loss": 0.85342455, + "learning_rate": 1.9992210283723878e-06, + "loss": 0.87569857, + "num_input_tokens_seen": 92499325, + "step": 4284, + "time_per_iteration": 3.6187326908111572 + }, + { + "auxiliary_loss_clip": 0.01141718, + "auxiliary_loss_mlp": 0.01085491, + "balance_loss_clip": 1.03057384, + "balance_loss_mlp": 1.00514364, + "epoch": 0.5152407863885048, + "flos": 25341263003520.0, + "grad_norm": 2.219440557493211, + "language_loss": 0.79355359, + "learning_rate": 1.9984420568629448e-06, + "loss": 0.8158257, + "num_input_tokens_seen": 92522090, + "step": 4285, + "time_per_iteration": 2.692197799682617 + }, + { + "auxiliary_loss_clip": 0.01133269, + "auxiliary_loss_mlp": 0.01085047, + "balance_loss_clip": 1.02960968, + "balance_loss_mlp": 1.00460458, + "epoch": 0.5153610292791438, + "flos": 18329740277760.0, + "grad_norm": 1.959602904511312, + "language_loss": 0.7857579, + "learning_rate": 1.9976630855898405e-06, + "loss": 0.80794108, + "num_input_tokens_seen": 92539845, + "step": 4286, + "time_per_iteration": 2.6735334396362305 + }, + { + "auxiliary_loss_clip": 0.01122765, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_clip": 1.02762043, + "balance_loss_mlp": 1.00271583, + "epoch": 0.515481272169783, + "flos": 30409945971840.0, + "grad_norm": 1.9786862744391642, + "language_loss": 0.74373114, + "learning_rate": 1.9968841146712445e-06, + "loss": 0.76578987, + "num_input_tokens_seen": 92559460, + "step": 4287, + "time_per_iteration": 2.738145112991333 + }, + { + "auxiliary_loss_clip": 0.01093007, + "auxiliary_loss_mlp": 0.00873027, + "balance_loss_clip": 1.02534294, + "balance_loss_mlp": 1.00005901, + "epoch": 0.5156015150604221, + "flos": 23037305863680.0, + "grad_norm": 1.482117624979784, + "language_loss": 0.71373719, + "learning_rate": 1.996105144225326e-06, + "loss": 0.73339754, + "num_input_tokens_seen": 92579695, + "step": 4288, + "time_per_iteration": 3.898122787475586 + }, + { + "auxiliary_loss_clip": 0.01131573, + "auxiliary_loss_mlp": 0.01085518, + "balance_loss_clip": 1.02863097, + "balance_loss_mlp": 1.00507522, + "epoch": 0.5157217579510611, + "flos": 17858556645120.0, + "grad_norm": 1.9044080138334794, + "language_loss": 0.78719056, + "learning_rate": 1.995326174370254e-06, + "loss": 0.80936146, + "num_input_tokens_seen": 92598795, + "step": 4289, + "time_per_iteration": 2.737988233566284 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.00872888, + "balance_loss_clip": 1.02830505, + "balance_loss_mlp": 1.00011992, + "epoch": 0.5158420008417003, + "flos": 19171486569600.0, + "grad_norm": 1.4991724264399022, + "language_loss": 0.72741389, + "learning_rate": 1.994547205224197e-06, + "loss": 0.74745846, + "num_input_tokens_seen": 92617700, + "step": 4290, + "time_per_iteration": 2.706113815307617 + }, + { + "auxiliary_loss_clip": 0.01114074, + "auxiliary_loss_mlp": 0.01084598, + "balance_loss_clip": 1.02727532, + "balance_loss_mlp": 1.00410748, + "epoch": 0.5159622437323393, + "flos": 22419534827520.0, + "grad_norm": 1.9885672081858992, + "language_loss": 0.67569488, + "learning_rate": 1.993768236905325e-06, + "loss": 0.69768155, + "num_input_tokens_seen": 92638370, + "step": 4291, + "time_per_iteration": 3.6198437213897705 + }, + { + "auxiliary_loss_clip": 0.011229, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_clip": 1.02827835, + "balance_loss_mlp": 1.00456738, + "epoch": 0.5160824866229784, + "flos": 24603010773120.0, + "grad_norm": 2.1073958948546863, + "language_loss": 0.65903282, + "learning_rate": 1.992989269531807e-06, + "loss": 0.68111277, + "num_input_tokens_seen": 92657180, + "step": 4292, + "time_per_iteration": 2.8089699745178223 + }, + { + "auxiliary_loss_clip": 0.01123396, + "auxiliary_loss_mlp": 0.01085161, + "balance_loss_clip": 1.02863169, + "balance_loss_mlp": 1.00462365, + "epoch": 0.5162027295136175, + "flos": 18002737837440.0, + "grad_norm": 6.9115035479109155, + "language_loss": 0.68443143, + "learning_rate": 1.99221030322181e-06, + "loss": 0.70651698, + "num_input_tokens_seen": 92673985, + "step": 4293, + "time_per_iteration": 2.8561272621154785 + }, + { + "auxiliary_loss_clip": 0.01125002, + "auxiliary_loss_mlp": 0.01086436, + "balance_loss_clip": 1.02972364, + "balance_loss_mlp": 1.00604153, + "epoch": 0.5163229724042566, + "flos": 27344611221120.0, + "grad_norm": 1.531629123315205, + "language_loss": 0.81150681, + "learning_rate": 1.991431338093505e-06, + "loss": 0.83362114, + "num_input_tokens_seen": 92696340, + "step": 4294, + "time_per_iteration": 2.814134359359741 + }, + { + "auxiliary_loss_clip": 0.01122691, + "auxiliary_loss_mlp": 0.01085328, + "balance_loss_clip": 1.02910638, + "balance_loss_mlp": 1.00502825, + "epoch": 0.5164432152948957, + "flos": 21762764599680.0, + "grad_norm": 1.7159518112698318, + "language_loss": 0.79478586, + "learning_rate": 1.9906523742650587e-06, + "loss": 0.81686604, + "num_input_tokens_seen": 92715200, + "step": 4295, + "time_per_iteration": 2.7582221031188965 + }, + { + "auxiliary_loss_clip": 0.0113996, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_clip": 1.0279727, + "balance_loss_mlp": 1.0057745, + "epoch": 0.5165634581855347, + "flos": 25550334115200.0, + "grad_norm": 1.7167283138427398, + "language_loss": 0.77721113, + "learning_rate": 1.9898734118546397e-06, + "loss": 0.79947436, + "num_input_tokens_seen": 92735150, + "step": 4296, + "time_per_iteration": 2.7240970134735107 + }, + { + "auxiliary_loss_clip": 0.01084499, + "auxiliary_loss_mlp": 0.01085058, + "balance_loss_clip": 1.02531409, + "balance_loss_mlp": 1.00456762, + "epoch": 0.5166837010761739, + "flos": 19901191363200.0, + "grad_norm": 1.6410659717635558, + "language_loss": 0.80466819, + "learning_rate": 1.989094450980416e-06, + "loss": 0.82636374, + "num_input_tokens_seen": 92755250, + "step": 4297, + "time_per_iteration": 2.912893295288086 + }, + { + "auxiliary_loss_clip": 0.01130223, + "auxiliary_loss_mlp": 0.0108532, + "balance_loss_clip": 1.02852511, + "balance_loss_mlp": 1.00482988, + "epoch": 0.516803943966813, + "flos": 26646076454400.0, + "grad_norm": 1.951551478832176, + "language_loss": 0.76293647, + "learning_rate": 1.9883154917605556e-06, + "loss": 0.78509188, + "num_input_tokens_seen": 92774460, + "step": 4298, + "time_per_iteration": 2.7511141300201416 + }, + { + "auxiliary_loss_clip": 0.01141711, + "auxiliary_loss_mlp": 0.01084993, + "balance_loss_clip": 1.03010559, + "balance_loss_mlp": 1.00469327, + "epoch": 0.516924186857452, + "flos": 19682854542720.0, + "grad_norm": 2.5084674885589644, + "language_loss": 0.83268797, + "learning_rate": 1.9875365343132262e-06, + "loss": 0.85495502, + "num_input_tokens_seen": 92791580, + "step": 4299, + "time_per_iteration": 2.606018304824829 + }, + { + "auxiliary_loss_clip": 0.01132347, + "auxiliary_loss_mlp": 0.00872927, + "balance_loss_clip": 1.02967322, + "balance_loss_mlp": 1.000121, + "epoch": 0.5170444297480912, + "flos": 15956583586560.0, + "grad_norm": 2.0001174935252197, + "language_loss": 0.8520481, + "learning_rate": 1.9867575787565946e-06, + "loss": 0.87210077, + "num_input_tokens_seen": 92806240, + "step": 4300, + "time_per_iteration": 2.6618728637695312 + }, + { + "auxiliary_loss_clip": 0.01131856, + "auxiliary_loss_mlp": 0.01085872, + "balance_loss_clip": 1.02923203, + "balance_loss_mlp": 1.00533402, + "epoch": 0.5171646726387302, + "flos": 14174157968640.0, + "grad_norm": 2.496102422506017, + "language_loss": 0.86093485, + "learning_rate": 1.9859786252088275e-06, + "loss": 0.88311207, + "num_input_tokens_seen": 92823420, + "step": 4301, + "time_per_iteration": 2.620778799057007 + }, + { + "auxiliary_loss_clip": 0.0111123, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_clip": 1.02663529, + "balance_loss_mlp": 1.00486624, + "epoch": 0.5172849155293693, + "flos": 23578550974080.0, + "grad_norm": 3.9787602389655867, + "language_loss": 0.66907847, + "learning_rate": 1.9851996737880914e-06, + "loss": 0.69104624, + "num_input_tokens_seen": 92838605, + "step": 4302, + "time_per_iteration": 2.752673387527466 + }, + { + "auxiliary_loss_clip": 0.01133515, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.02990937, + "balance_loss_mlp": 1.00493479, + "epoch": 0.5174051584200084, + "flos": 14283541860480.0, + "grad_norm": 2.066596346561316, + "language_loss": 0.74252802, + "learning_rate": 1.9844207246125537e-06, + "loss": 0.76471794, + "num_input_tokens_seen": 92855185, + "step": 4303, + "time_per_iteration": 2.6639089584350586 + }, + { + "auxiliary_loss_clip": 0.01126821, + "auxiliary_loss_mlp": 0.01084624, + "balance_loss_clip": 1.0318023, + "balance_loss_mlp": 1.00422895, + "epoch": 0.5175254013106475, + "flos": 37889384192640.0, + "grad_norm": 1.7222575341539488, + "language_loss": 0.68545109, + "learning_rate": 1.983641777800379e-06, + "loss": 0.70756555, + "num_input_tokens_seen": 92877830, + "step": 4304, + "time_per_iteration": 2.9115803241729736 + }, + { + "auxiliary_loss_clip": 0.01110065, + "auxiliary_loss_mlp": 0.01079086, + "balance_loss_clip": 1.02906191, + "balance_loss_mlp": 1.00012195, + "epoch": 0.5176456442012866, + "flos": 68549737829760.0, + "grad_norm": 0.7400019411256932, + "language_loss": 0.58785516, + "learning_rate": 1.9828628334697343e-06, + "loss": 0.60974663, + "num_input_tokens_seen": 92945040, + "step": 4305, + "time_per_iteration": 3.42141056060791 + }, + { + "auxiliary_loss_clip": 0.01109271, + "auxiliary_loss_mlp": 0.0107972, + "balance_loss_clip": 1.02866435, + "balance_loss_mlp": 1.00037408, + "epoch": 0.5177658870919257, + "flos": 64084137235200.0, + "grad_norm": 0.9329717497414373, + "language_loss": 0.54728168, + "learning_rate": 1.982083891738784e-06, + "loss": 0.56917161, + "num_input_tokens_seen": 93005910, + "step": 4306, + "time_per_iteration": 3.3177781105041504 + }, + { + "auxiliary_loss_clip": 0.01121815, + "auxiliary_loss_mlp": 0.01084985, + "balance_loss_clip": 1.02946615, + "balance_loss_mlp": 1.00459015, + "epoch": 0.5178861299825648, + "flos": 26651248012800.0, + "grad_norm": 1.5004012397308004, + "language_loss": 0.82833803, + "learning_rate": 1.9813049527256923e-06, + "loss": 0.85040593, + "num_input_tokens_seen": 93026305, + "step": 4307, + "time_per_iteration": 2.7503204345703125 + }, + { + "auxiliary_loss_clip": 0.01115855, + "auxiliary_loss_mlp": 0.01085654, + "balance_loss_clip": 1.02867246, + "balance_loss_mlp": 1.0050211, + "epoch": 0.5180063728732038, + "flos": 17931886260480.0, + "grad_norm": 2.2756049765576467, + "language_loss": 0.82355547, + "learning_rate": 1.9805260165486252e-06, + "loss": 0.8455705, + "num_input_tokens_seen": 93045675, + "step": 4308, + "time_per_iteration": 2.747493028640747 + }, + { + "auxiliary_loss_clip": 0.01133259, + "auxiliary_loss_mlp": 0.01084816, + "balance_loss_clip": 1.03036523, + "balance_loss_mlp": 1.00432611, + "epoch": 0.518126615763843, + "flos": 19500895221120.0, + "grad_norm": 1.8084144420257062, + "language_loss": 0.86398584, + "learning_rate": 1.9797470833257457e-06, + "loss": 0.88616657, + "num_input_tokens_seen": 93065375, + "step": 4309, + "time_per_iteration": 2.7125601768493652 + }, + { + "auxiliary_loss_clip": 0.01130779, + "auxiliary_loss_mlp": 0.01086749, + "balance_loss_clip": 1.02892137, + "balance_loss_mlp": 1.0062592, + "epoch": 0.5182468586544821, + "flos": 20704082117760.0, + "grad_norm": 2.9581646883604837, + "language_loss": 0.77272117, + "learning_rate": 1.9789681531752177e-06, + "loss": 0.79489636, + "num_input_tokens_seen": 93085595, + "step": 4310, + "time_per_iteration": 3.5418899059295654 + }, + { + "auxiliary_loss_clip": 0.01101486, + "auxiliary_loss_mlp": 0.01085171, + "balance_loss_clip": 1.0263474, + "balance_loss_mlp": 1.0048244, + "epoch": 0.5183671015451211, + "flos": 23112107936640.0, + "grad_norm": 1.4824497977747817, + "language_loss": 0.72769308, + "learning_rate": 1.978189226215204e-06, + "loss": 0.74955964, + "num_input_tokens_seen": 93106140, + "step": 4311, + "time_per_iteration": 2.789714813232422 + }, + { + "auxiliary_loss_clip": 0.01141991, + "auxiliary_loss_mlp": 0.01084889, + "balance_loss_clip": 1.03037071, + "balance_loss_mlp": 1.0043993, + "epoch": 0.5184873444357603, + "flos": 17597090568960.0, + "grad_norm": 1.7680652508288126, + "language_loss": 0.77183557, + "learning_rate": 1.9774103025638675e-06, + "loss": 0.7941044, + "num_input_tokens_seen": 93124265, + "step": 4312, + "time_per_iteration": 2.648526906967163 + }, + { + "auxiliary_loss_clip": 0.0110276, + "auxiliary_loss_mlp": 0.01085229, + "balance_loss_clip": 1.02712226, + "balance_loss_mlp": 1.00478697, + "epoch": 0.5186075873263993, + "flos": 24936800883840.0, + "grad_norm": 1.5803093005066737, + "language_loss": 0.7627542, + "learning_rate": 1.9766313823393696e-06, + "loss": 0.78463411, + "num_input_tokens_seen": 93145130, + "step": 4313, + "time_per_iteration": 2.8850667476654053 + }, + { + "auxiliary_loss_clip": 0.01105786, + "auxiliary_loss_mlp": 0.01084828, + "balance_loss_clip": 1.02874112, + "balance_loss_mlp": 1.00438511, + "epoch": 0.5187278302170384, + "flos": 15190106244480.0, + "grad_norm": 2.0926670325186336, + "language_loss": 0.69022363, + "learning_rate": 1.975852465659873e-06, + "loss": 0.71212983, + "num_input_tokens_seen": 93161110, + "step": 4314, + "time_per_iteration": 3.6867308616638184 + }, + { + "auxiliary_loss_clip": 0.01131503, + "auxiliary_loss_mlp": 0.01085569, + "balance_loss_clip": 1.02862251, + "balance_loss_mlp": 1.00507867, + "epoch": 0.5188480731076776, + "flos": 25009412227200.0, + "grad_norm": 6.460598330697425, + "language_loss": 0.70227319, + "learning_rate": 1.9750735526435377e-06, + "loss": 0.72444391, + "num_input_tokens_seen": 93178055, + "step": 4315, + "time_per_iteration": 2.7097692489624023 + }, + { + "auxiliary_loss_clip": 0.01116217, + "auxiliary_loss_mlp": 0.01084854, + "balance_loss_clip": 1.02451921, + "balance_loss_mlp": 1.0044117, + "epoch": 0.5189683159983166, + "flos": 24790141653120.0, + "grad_norm": 2.5850420490401373, + "language_loss": 0.79324436, + "learning_rate": 1.974294643408525e-06, + "loss": 0.81525511, + "num_input_tokens_seen": 93195850, + "step": 4316, + "time_per_iteration": 3.7482051849365234 + }, + { + "auxiliary_loss_clip": 0.01133205, + "auxiliary_loss_mlp": 0.01085476, + "balance_loss_clip": 1.02942431, + "balance_loss_mlp": 1.00484335, + "epoch": 0.5190885588889557, + "flos": 24754266944640.0, + "grad_norm": 1.7539169624159086, + "language_loss": 0.66767371, + "learning_rate": 1.9735157380729947e-06, + "loss": 0.68986046, + "num_input_tokens_seen": 93216260, + "step": 4317, + "time_per_iteration": 2.7101874351501465 + }, + { + "auxiliary_loss_clip": 0.01123777, + "auxiliary_loss_mlp": 0.01085487, + "balance_loss_clip": 1.02921569, + "balance_loss_mlp": 1.00509202, + "epoch": 0.5192088017795948, + "flos": 24712646060160.0, + "grad_norm": 1.8417401534783076, + "language_loss": 0.8396346, + "learning_rate": 1.9727368367551053e-06, + "loss": 0.86172724, + "num_input_tokens_seen": 93234810, + "step": 4318, + "time_per_iteration": 2.7575221061706543 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_clip": 1.02730799, + "balance_loss_mlp": 1.00358057, + "epoch": 0.5193290446702339, + "flos": 27229588894080.0, + "grad_norm": 1.9424119272124516, + "language_loss": 0.68412066, + "learning_rate": 1.9719579395730164e-06, + "loss": 0.70612633, + "num_input_tokens_seen": 93254185, + "step": 4319, + "time_per_iteration": 2.793025016784668 + }, + { + "auxiliary_loss_clip": 0.01142868, + "auxiliary_loss_mlp": 0.01085917, + "balance_loss_clip": 1.03126025, + "balance_loss_mlp": 1.00528359, + "epoch": 0.5194492875608729, + "flos": 11473352392320.0, + "grad_norm": 2.183388300654653, + "language_loss": 0.93481159, + "learning_rate": 1.9711790466448854e-06, + "loss": 0.95709944, + "num_input_tokens_seen": 93268205, + "step": 4320, + "time_per_iteration": 2.6537818908691406 + }, + { + "auxiliary_loss_clip": 0.0110272, + "auxiliary_loss_mlp": 0.01085939, + "balance_loss_clip": 1.02744055, + "balance_loss_mlp": 1.00530601, + "epoch": 0.5195695304515121, + "flos": 20338906498560.0, + "grad_norm": 4.40061994640956, + "language_loss": 0.71564829, + "learning_rate": 1.9704001580888704e-06, + "loss": 0.73753488, + "num_input_tokens_seen": 93286945, + "step": 4321, + "time_per_iteration": 2.8271100521087646 + }, + { + "auxiliary_loss_clip": 0.01120552, + "auxiliary_loss_mlp": 0.00873072, + "balance_loss_clip": 1.0269506, + "balance_loss_mlp": 1.00012279, + "epoch": 0.5196897733421512, + "flos": 20048317470720.0, + "grad_norm": 1.7107120490072147, + "language_loss": 0.86610329, + "learning_rate": 1.9696212740231283e-06, + "loss": 0.88603956, + "num_input_tokens_seen": 93305595, + "step": 4322, + "time_per_iteration": 2.7537829875946045 + }, + { + "auxiliary_loss_clip": 0.01132655, + "auxiliary_loss_mlp": 0.01085793, + "balance_loss_clip": 1.02878332, + "balance_loss_mlp": 1.00501692, + "epoch": 0.5198100162327902, + "flos": 23805507058560.0, + "grad_norm": 1.9227077594032118, + "language_loss": 0.81916457, + "learning_rate": 1.9688423945658146e-06, + "loss": 0.84134907, + "num_input_tokens_seen": 93326460, + "step": 4323, + "time_per_iteration": 2.669093132019043 + }, + { + "auxiliary_loss_clip": 0.01106455, + "auxiliary_loss_mlp": 0.01085918, + "balance_loss_clip": 1.02779126, + "balance_loss_mlp": 1.0051899, + "epoch": 0.5199302591234293, + "flos": 24023951619840.0, + "grad_norm": 2.173390087974897, + "language_loss": 0.71624583, + "learning_rate": 1.9680635198350845e-06, + "loss": 0.73816961, + "num_input_tokens_seen": 93346170, + "step": 4324, + "time_per_iteration": 2.8756959438323975 + }, + { + "auxiliary_loss_clip": 0.01132774, + "auxiliary_loss_mlp": 0.0108558, + "balance_loss_clip": 1.02892804, + "balance_loss_mlp": 1.00494671, + "epoch": 0.5200505020140684, + "flos": 26359366095360.0, + "grad_norm": 1.9652323245951495, + "language_loss": 0.72389531, + "learning_rate": 1.967284649949093e-06, + "loss": 0.74607885, + "num_input_tokens_seen": 93365380, + "step": 4325, + "time_per_iteration": 2.6783881187438965 + }, + { + "auxiliary_loss_clip": 0.01108837, + "auxiliary_loss_mlp": 0.01085856, + "balance_loss_clip": 1.02417755, + "balance_loss_mlp": 1.00536597, + "epoch": 0.5201707449047075, + "flos": 39604262284800.0, + "grad_norm": 2.1601184619999962, + "language_loss": 0.72436345, + "learning_rate": 1.966505785025994e-06, + "loss": 0.74631041, + "num_input_tokens_seen": 93387285, + "step": 4326, + "time_per_iteration": 2.937399387359619 + }, + { + "auxiliary_loss_clip": 0.01091537, + "auxiliary_loss_mlp": 0.01085464, + "balance_loss_clip": 1.02777028, + "balance_loss_mlp": 1.00487876, + "epoch": 0.5202909877953465, + "flos": 53682788292480.0, + "grad_norm": 1.8144419654101835, + "language_loss": 0.75883174, + "learning_rate": 1.965726925183941e-06, + "loss": 0.7806018, + "num_input_tokens_seen": 93410390, + "step": 4327, + "time_per_iteration": 3.066448926925659 + }, + { + "auxiliary_loss_clip": 0.01142419, + "auxiliary_loss_mlp": 0.01084529, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.00394392, + "epoch": 0.5204112306859857, + "flos": 19537021324800.0, + "grad_norm": 1.7935521510019172, + "language_loss": 0.84244436, + "learning_rate": 1.964948070541087e-06, + "loss": 0.86471391, + "num_input_tokens_seen": 93429050, + "step": 4328, + "time_per_iteration": 2.5709855556488037 + }, + { + "auxiliary_loss_clip": 0.01132227, + "auxiliary_loss_mlp": 0.01084972, + "balance_loss_clip": 1.02871799, + "balance_loss_mlp": 1.00443423, + "epoch": 0.5205314735766248, + "flos": 15304697608320.0, + "grad_norm": 2.41855839794208, + "language_loss": 0.69606477, + "learning_rate": 1.9641692212155816e-06, + "loss": 0.7182368, + "num_input_tokens_seen": 93446815, + "step": 4329, + "time_per_iteration": 2.699368953704834 + }, + { + "auxiliary_loss_clip": 0.01098978, + "auxiliary_loss_mlp": 0.01084599, + "balance_loss_clip": 1.02553165, + "balance_loss_mlp": 1.00425208, + "epoch": 0.5206517164672638, + "flos": 59263701160320.0, + "grad_norm": 1.8166227950143095, + "language_loss": 0.72690535, + "learning_rate": 1.9633903773255777e-06, + "loss": 0.74874109, + "num_input_tokens_seen": 93469130, + "step": 4330, + "time_per_iteration": 3.087578058242798 + }, + { + "auxiliary_loss_clip": 0.0114125, + "auxiliary_loss_mlp": 0.01084686, + "balance_loss_clip": 1.02938104, + "balance_loss_mlp": 1.00414872, + "epoch": 0.520771959357903, + "flos": 26871129118080.0, + "grad_norm": 1.5472273120938906, + "language_loss": 0.74834186, + "learning_rate": 1.9626115389892237e-06, + "loss": 0.77060121, + "num_input_tokens_seen": 93489920, + "step": 4331, + "time_per_iteration": 2.655357599258423 + }, + { + "auxiliary_loss_clip": 0.01099573, + "auxiliary_loss_mlp": 0.01084959, + "balance_loss_clip": 1.02984238, + "balance_loss_mlp": 1.00446844, + "epoch": 0.520892202248542, + "flos": 26907075653760.0, + "grad_norm": 1.8459175973227329, + "language_loss": 0.85378826, + "learning_rate": 1.96183270632467e-06, + "loss": 0.87563354, + "num_input_tokens_seen": 93509770, + "step": 4332, + "time_per_iteration": 2.7661900520324707 + }, + { + "auxiliary_loss_clip": 0.01091685, + "auxiliary_loss_mlp": 0.00873222, + "balance_loss_clip": 1.02738154, + "balance_loss_mlp": 1.00015652, + "epoch": 0.5210124451391811, + "flos": 25849434666240.0, + "grad_norm": 1.6824212079199412, + "language_loss": 0.78767365, + "learning_rate": 1.9610538794500644e-06, + "loss": 0.80732268, + "num_input_tokens_seen": 93529320, + "step": 4333, + "time_per_iteration": 2.815241813659668 + }, + { + "auxiliary_loss_clip": 0.01100416, + "auxiliary_loss_mlp": 0.01079489, + "balance_loss_clip": 1.02805352, + "balance_loss_mlp": 1.00014305, + "epoch": 0.5211326880298203, + "flos": 70553804319360.0, + "grad_norm": 0.7725605208485354, + "language_loss": 0.59422821, + "learning_rate": 1.9602750584835542e-06, + "loss": 0.61602724, + "num_input_tokens_seen": 93595255, + "step": 4334, + "time_per_iteration": 3.42910099029541 + }, + { + "auxiliary_loss_clip": 0.01125735, + "auxiliary_loss_mlp": 0.01085216, + "balance_loss_clip": 1.03003025, + "balance_loss_mlp": 1.00482118, + "epoch": 0.5212529309204593, + "flos": 15628898787840.0, + "grad_norm": 1.9223128960901088, + "language_loss": 0.82669532, + "learning_rate": 1.959496243543286e-06, + "loss": 0.84880483, + "num_input_tokens_seen": 93613135, + "step": 4335, + "time_per_iteration": 3.5843217372894287 + }, + { + "auxiliary_loss_clip": 0.01134801, + "auxiliary_loss_mlp": 0.01086753, + "balance_loss_clip": 1.03197968, + "balance_loss_mlp": 1.00621533, + "epoch": 0.5213731738110984, + "flos": 26242655829120.0, + "grad_norm": 1.9504414940989991, + "language_loss": 0.79274601, + "learning_rate": 1.9587174347474057e-06, + "loss": 0.81496155, + "num_input_tokens_seen": 93629645, + "step": 4336, + "time_per_iteration": 3.5348124504089355 + }, + { + "auxiliary_loss_clip": 0.0110263, + "auxiliary_loss_mlp": 0.01084979, + "balance_loss_clip": 1.02622914, + "balance_loss_mlp": 1.00444078, + "epoch": 0.5214934167017375, + "flos": 19418407637760.0, + "grad_norm": 2.430841663975313, + "language_loss": 0.8183006, + "learning_rate": 1.9579386322140574e-06, + "loss": 0.8401767, + "num_input_tokens_seen": 93645325, + "step": 4337, + "time_per_iteration": 2.7124216556549072 + }, + { + "auxiliary_loss_clip": 0.01143069, + "auxiliary_loss_mlp": 0.00873059, + "balance_loss_clip": 1.03071737, + "balance_loss_mlp": 1.00011301, + "epoch": 0.5216136595923766, + "flos": 30955788023040.0, + "grad_norm": 1.754507244827867, + "language_loss": 0.80916184, + "learning_rate": 1.9571598360613854e-06, + "loss": 0.82932317, + "num_input_tokens_seen": 93668200, + "step": 4338, + "time_per_iteration": 2.7342779636383057 + }, + { + "auxiliary_loss_clip": 0.01124162, + "auxiliary_loss_mlp": 0.01084941, + "balance_loss_clip": 1.02839589, + "balance_loss_mlp": 1.00440288, + "epoch": 0.5217339024830157, + "flos": 21945047143680.0, + "grad_norm": 2.0460199805559722, + "language_loss": 0.69833076, + "learning_rate": 1.956381046407532e-06, + "loss": 0.72042179, + "num_input_tokens_seen": 93688495, + "step": 4339, + "time_per_iteration": 3.7126641273498535 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_clip": 1.02782035, + "balance_loss_mlp": 1.00527215, + "epoch": 0.5218541453736548, + "flos": 20923209037440.0, + "grad_norm": 1.5698030060805475, + "language_loss": 0.86113828, + "learning_rate": 1.9556022633706394e-06, + "loss": 0.88314116, + "num_input_tokens_seen": 93707285, + "step": 4340, + "time_per_iteration": 2.8160343170166016 + }, + { + "auxiliary_loss_clip": 0.01122427, + "auxiliary_loss_mlp": 0.01084093, + "balance_loss_clip": 1.02818727, + "balance_loss_mlp": 1.00355518, + "epoch": 0.5219743882642939, + "flos": 23951663498880.0, + "grad_norm": 1.772424709532137, + "language_loss": 0.80148196, + "learning_rate": 1.954823487068848e-06, + "loss": 0.82354718, + "num_input_tokens_seen": 93727495, + "step": 4341, + "time_per_iteration": 3.685363531112671 + }, + { + "auxiliary_loss_clip": 0.01132621, + "auxiliary_loss_mlp": 0.01086277, + "balance_loss_clip": 1.03047204, + "balance_loss_mlp": 1.00569105, + "epoch": 0.5220946311549329, + "flos": 28799280213120.0, + "grad_norm": 1.9013839105418817, + "language_loss": 0.8121534, + "learning_rate": 1.9540447176202976e-06, + "loss": 0.8343423, + "num_input_tokens_seen": 93748740, + "step": 4342, + "time_per_iteration": 2.750192880630493 + }, + { + "auxiliary_loss_clip": 0.01114585, + "auxiliary_loss_mlp": 0.01079689, + "balance_loss_clip": 1.0267328, + "balance_loss_mlp": 1.0003432, + "epoch": 0.5222148740455721, + "flos": 67189369017600.0, + "grad_norm": 0.8697072467446496, + "language_loss": 0.60788918, + "learning_rate": 1.9532659551431272e-06, + "loss": 0.62983191, + "num_input_tokens_seen": 93815770, + "step": 4343, + "time_per_iteration": 3.381422519683838 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.02945805, + "balance_loss_mlp": 1.00383091, + "epoch": 0.5223351169362112, + "flos": 61856164339200.0, + "grad_norm": 1.5800145442908473, + "language_loss": 0.67704719, + "learning_rate": 1.9524871997554744e-06, + "loss": 0.69922006, + "num_input_tokens_seen": 93843530, + "step": 4344, + "time_per_iteration": 3.0410943031311035 + }, + { + "auxiliary_loss_clip": 0.01131663, + "auxiliary_loss_mlp": 0.01085461, + "balance_loss_clip": 1.0288105, + "balance_loss_mlp": 1.0048753, + "epoch": 0.5224553598268502, + "flos": 14647388676480.0, + "grad_norm": 2.041702533016871, + "language_loss": 0.80477726, + "learning_rate": 1.951708451575475e-06, + "loss": 0.8269484, + "num_input_tokens_seen": 93860595, + "step": 4345, + "time_per_iteration": 2.6430130004882812 + }, + { + "auxiliary_loss_clip": 0.01115797, + "auxiliary_loss_mlp": 0.01085119, + "balance_loss_clip": 1.02828133, + "balance_loss_mlp": 1.00462842, + "epoch": 0.5225756027174894, + "flos": 14826043946880.0, + "grad_norm": 2.182933052520282, + "language_loss": 0.82275999, + "learning_rate": 1.9509297107212657e-06, + "loss": 0.84476918, + "num_input_tokens_seen": 93877365, + "step": 4346, + "time_per_iteration": 2.79007625579834 + }, + { + "auxiliary_loss_clip": 0.01141154, + "auxiliary_loss_mlp": 0.01083832, + "balance_loss_clip": 1.02965832, + "balance_loss_mlp": 1.00338936, + "epoch": 0.5226958456081284, + "flos": 23512009029120.0, + "grad_norm": 1.5517008497175018, + "language_loss": 0.79086947, + "learning_rate": 1.95015097731098e-06, + "loss": 0.81311929, + "num_input_tokens_seen": 93896855, + "step": 4347, + "time_per_iteration": 2.715118169784546 + }, + { + "auxiliary_loss_clip": 0.01141689, + "auxiliary_loss_mlp": 0.01084442, + "balance_loss_clip": 1.02985048, + "balance_loss_mlp": 1.00399995, + "epoch": 0.5228160884987675, + "flos": 19062928690560.0, + "grad_norm": 1.9814748679176593, + "language_loss": 0.81949645, + "learning_rate": 1.949372251462751e-06, + "loss": 0.84175777, + "num_input_tokens_seen": 93914270, + "step": 4348, + "time_per_iteration": 2.6224734783172607 + }, + { + "auxiliary_loss_clip": 0.01113856, + "auxiliary_loss_mlp": 0.00872932, + "balance_loss_clip": 1.02824795, + "balance_loss_mlp": 1.0000813, + "epoch": 0.5229363313894067, + "flos": 21063224252160.0, + "grad_norm": 1.6664391836944186, + "language_loss": 0.82912683, + "learning_rate": 1.9485935332947124e-06, + "loss": 0.84899467, + "num_input_tokens_seen": 93932180, + "step": 4349, + "time_per_iteration": 2.870022773742676 + }, + { + "auxiliary_loss_clip": 0.01114926, + "auxiliary_loss_mlp": 0.01084357, + "balance_loss_clip": 1.02715695, + "balance_loss_mlp": 1.0039624, + "epoch": 0.5230565742800457, + "flos": 14830389492480.0, + "grad_norm": 4.374150338200736, + "language_loss": 0.83356535, + "learning_rate": 1.947814822924993e-06, + "loss": 0.85555816, + "num_input_tokens_seen": 93949690, + "step": 4350, + "time_per_iteration": 2.707268714904785 + }, + { + "auxiliary_loss_clip": 0.01140989, + "auxiliary_loss_mlp": 0.01086006, + "balance_loss_clip": 1.02949333, + "balance_loss_mlp": 1.0055635, + "epoch": 0.5231768171706848, + "flos": 25813021253760.0, + "grad_norm": 1.7302572274053116, + "language_loss": 0.82965696, + "learning_rate": 1.9470361204717236e-06, + "loss": 0.85192692, + "num_input_tokens_seen": 93968830, + "step": 4351, + "time_per_iteration": 2.6982078552246094 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.00873023, + "balance_loss_clip": 1.02805483, + "balance_loss_mlp": 1.0000813, + "epoch": 0.5232970600613239, + "flos": 22743807834240.0, + "grad_norm": 1.5328928454975643, + "language_loss": 0.80720806, + "learning_rate": 1.9462574260530326e-06, + "loss": 0.82708234, + "num_input_tokens_seen": 93989110, + "step": 4352, + "time_per_iteration": 2.786750555038452 + }, + { + "auxiliary_loss_clip": 0.01132092, + "auxiliary_loss_mlp": 0.01085208, + "balance_loss_clip": 1.02886128, + "balance_loss_mlp": 1.00467014, + "epoch": 0.523417302951963, + "flos": 17310703432320.0, + "grad_norm": 1.8960762715827006, + "language_loss": 0.80986822, + "learning_rate": 1.9454787397870472e-06, + "loss": 0.83204126, + "num_input_tokens_seen": 94006430, + "step": 4353, + "time_per_iteration": 2.7106404304504395 + }, + { + "auxiliary_loss_clip": 0.01070852, + "auxiliary_loss_mlp": 0.0108479, + "balance_loss_clip": 1.02229905, + "balance_loss_mlp": 1.0043956, + "epoch": 0.523537545842602, + "flos": 18551740285440.0, + "grad_norm": 1.7438933445932676, + "language_loss": 0.71738815, + "learning_rate": 1.944700061791894e-06, + "loss": 0.73894465, + "num_input_tokens_seen": 94024825, + "step": 4354, + "time_per_iteration": 2.770888328552246 + }, + { + "auxiliary_loss_clip": 0.01131829, + "auxiliary_loss_mlp": 0.01085231, + "balance_loss_clip": 1.02956021, + "balance_loss_mlp": 1.00464523, + "epoch": 0.5236577887332411, + "flos": 19719267955200.0, + "grad_norm": 2.2606155977556943, + "language_loss": 0.64927524, + "learning_rate": 1.943921392185698e-06, + "loss": 0.67144585, + "num_input_tokens_seen": 94043450, + "step": 4355, + "time_per_iteration": 2.7410531044006348 + }, + { + "auxiliary_loss_clip": 0.01108702, + "auxiliary_loss_mlp": 0.01084884, + "balance_loss_clip": 1.02905536, + "balance_loss_mlp": 1.0044415, + "epoch": 0.5237780316238803, + "flos": 23550218121600.0, + "grad_norm": 1.806560564939762, + "language_loss": 0.76821017, + "learning_rate": 1.9431427310865814e-06, + "loss": 0.79014599, + "num_input_tokens_seen": 94063055, + "step": 4356, + "time_per_iteration": 2.7220826148986816 + }, + { + "auxiliary_loss_clip": 0.01107639, + "auxiliary_loss_mlp": 0.01085541, + "balance_loss_clip": 1.02365315, + "balance_loss_mlp": 1.00514603, + "epoch": 0.5238982745145193, + "flos": 22491894775680.0, + "grad_norm": 2.1731922866640434, + "language_loss": 0.79001749, + "learning_rate": 1.942364078612667e-06, + "loss": 0.81194931, + "num_input_tokens_seen": 94081785, + "step": 4357, + "time_per_iteration": 2.7568957805633545 + }, + { + "auxiliary_loss_clip": 0.01097509, + "auxiliary_loss_mlp": 0.01084703, + "balance_loss_clip": 1.0272615, + "balance_loss_mlp": 1.00416481, + "epoch": 0.5240185174051584, + "flos": 27088927234560.0, + "grad_norm": 1.8395594399536788, + "language_loss": 0.75517058, + "learning_rate": 1.9415854348820765e-06, + "loss": 0.77699268, + "num_input_tokens_seen": 94101635, + "step": 4358, + "time_per_iteration": 2.8177173137664795 + }, + { + "auxiliary_loss_clip": 0.01116187, + "auxiliary_loss_mlp": 0.01085615, + "balance_loss_clip": 1.02851045, + "balance_loss_mlp": 1.00502944, + "epoch": 0.5241387602957975, + "flos": 22674680110080.0, + "grad_norm": 1.9941305457670082, + "language_loss": 0.68354946, + "learning_rate": 1.940806800012929e-06, + "loss": 0.70556748, + "num_input_tokens_seen": 94121705, + "step": 4359, + "time_per_iteration": 2.662717580795288 + }, + { + "auxiliary_loss_clip": 0.01094367, + "auxiliary_loss_mlp": 0.00873057, + "balance_loss_clip": 1.0255208, + "balance_loss_mlp": 1.00007868, + "epoch": 0.5242590031864366, + "flos": 40553453134080.0, + "grad_norm": 1.7219438923345394, + "language_loss": 0.63396269, + "learning_rate": 1.9400281741233432e-06, + "loss": 0.65363693, + "num_input_tokens_seen": 94146595, + "step": 4360, + "time_per_iteration": 2.997114896774292 + }, + { + "auxiliary_loss_clip": 0.01087417, + "auxiliary_loss_mlp": 0.01079786, + "balance_loss_clip": 1.02483797, + "balance_loss_mlp": 1.00044084, + "epoch": 0.5243792460770756, + "flos": 66676313105280.0, + "grad_norm": 0.6791228186182076, + "language_loss": 0.52607024, + "learning_rate": 1.939249557331435e-06, + "loss": 0.54774225, + "num_input_tokens_seen": 94212410, + "step": 4361, + "time_per_iteration": 4.314127206802368 + }, + { + "auxiliary_loss_clip": 0.01114974, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_clip": 1.02771306, + "balance_loss_mlp": 1.00427711, + "epoch": 0.5244994889677148, + "flos": 28183663992960.0, + "grad_norm": 2.5801536102370917, + "language_loss": 0.729186, + "learning_rate": 1.938470949755321e-06, + "loss": 0.7511825, + "num_input_tokens_seen": 94232290, + "step": 4362, + "time_per_iteration": 3.686156749725342 + }, + { + "auxiliary_loss_clip": 0.01097517, + "auxiliary_loss_mlp": 0.01079396, + "balance_loss_clip": 1.02546263, + "balance_loss_mlp": 1.00043225, + "epoch": 0.5246197318583539, + "flos": 65950379239680.0, + "grad_norm": 0.8059822857584193, + "language_loss": 0.55710626, + "learning_rate": 1.937692351513115e-06, + "loss": 0.57887536, + "num_input_tokens_seen": 94291285, + "step": 4363, + "time_per_iteration": 3.2422573566436768 + }, + { + "auxiliary_loss_clip": 0.01132416, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_clip": 1.02849627, + "balance_loss_mlp": 1.00385106, + "epoch": 0.5247399747489929, + "flos": 21033490769280.0, + "grad_norm": 1.7598016505563172, + "language_loss": 0.8047123, + "learning_rate": 1.9369137627229297e-06, + "loss": 0.82688081, + "num_input_tokens_seen": 94309685, + "step": 4364, + "time_per_iteration": 3.6603806018829346 + }, + { + "auxiliary_loss_clip": 0.01130611, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_clip": 1.02919507, + "balance_loss_mlp": 1.00453281, + "epoch": 0.5248602176396321, + "flos": 19025940660480.0, + "grad_norm": 1.9176933502545102, + "language_loss": 0.88468587, + "learning_rate": 1.936135183502877e-06, + "loss": 0.90684223, + "num_input_tokens_seen": 94326985, + "step": 4365, + "time_per_iteration": 2.6444215774536133 + }, + { + "auxiliary_loss_clip": 0.0109865, + "auxiliary_loss_mlp": 0.01086673, + "balance_loss_clip": 1.0291183, + "balance_loss_mlp": 1.00608766, + "epoch": 0.5249804605302711, + "flos": 22200084685440.0, + "grad_norm": 1.9247574267412246, + "language_loss": 0.80241144, + "learning_rate": 1.935356613971066e-06, + "loss": 0.82426471, + "num_input_tokens_seen": 94347645, + "step": 4366, + "time_per_iteration": 2.831367015838623 + }, + { + "auxiliary_loss_clip": 0.01121458, + "auxiliary_loss_mlp": 0.00873017, + "balance_loss_clip": 1.02780092, + "balance_loss_mlp": 1.00006151, + "epoch": 0.5251007034209102, + "flos": 23805686626560.0, + "grad_norm": 1.6408626198326062, + "language_loss": 0.76660407, + "learning_rate": 1.9345780542456047e-06, + "loss": 0.78654879, + "num_input_tokens_seen": 94367020, + "step": 4367, + "time_per_iteration": 3.7153310775756836 + }, + { + "auxiliary_loss_clip": 0.01132291, + "auxiliary_loss_mlp": 0.01084667, + "balance_loss_clip": 1.02914691, + "balance_loss_mlp": 1.0042727, + "epoch": 0.5252209463115494, + "flos": 23294605962240.0, + "grad_norm": 1.7984965133791395, + "language_loss": 0.71815085, + "learning_rate": 1.9337995044446007e-06, + "loss": 0.74032044, + "num_input_tokens_seen": 94385860, + "step": 4368, + "time_per_iteration": 2.649479866027832 + }, + { + "auxiliary_loss_clip": 0.01132485, + "auxiliary_loss_mlp": 0.01085352, + "balance_loss_clip": 1.02888978, + "balance_loss_mlp": 1.00481439, + "epoch": 0.5253411892021884, + "flos": 19828687760640.0, + "grad_norm": 1.9670790217592766, + "language_loss": 0.8028633, + "learning_rate": 1.9330209646861596e-06, + "loss": 0.82504165, + "num_input_tokens_seen": 94405010, + "step": 4369, + "time_per_iteration": 2.7231311798095703 + }, + { + "auxiliary_loss_clip": 0.01113317, + "auxiliary_loss_mlp": 0.01085314, + "balance_loss_clip": 1.02535021, + "balance_loss_mlp": 1.00487161, + "epoch": 0.5254614320928275, + "flos": 24133730561280.0, + "grad_norm": 1.816396152308431, + "language_loss": 0.77925068, + "learning_rate": 1.9322424350883843e-06, + "loss": 0.80123699, + "num_input_tokens_seen": 94426845, + "step": 4370, + "time_per_iteration": 2.7466647624969482 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_clip": 1.02837312, + "balance_loss_mlp": 1.0035646, + "epoch": 0.5255816749834666, + "flos": 24644954880000.0, + "grad_norm": 1.5818321448962556, + "language_loss": 0.7882297, + "learning_rate": 1.931463915769379e-06, + "loss": 0.8103019, + "num_input_tokens_seen": 94446960, + "step": 4371, + "time_per_iteration": 2.7563910484313965 + }, + { + "auxiliary_loss_clip": 0.01104806, + "auxiliary_loss_mlp": 0.0108567, + "balance_loss_clip": 1.02808177, + "balance_loss_mlp": 1.00517964, + "epoch": 0.5257019178741057, + "flos": 14136595320960.0, + "grad_norm": 2.13012217906687, + "language_loss": 0.7442807, + "learning_rate": 1.930685406847242e-06, + "loss": 0.7661854, + "num_input_tokens_seen": 94461535, + "step": 4372, + "time_per_iteration": 2.792330503463745 + }, + { + "auxiliary_loss_clip": 0.0112092, + "auxiliary_loss_mlp": 0.01085271, + "balance_loss_clip": 1.02772939, + "balance_loss_mlp": 1.00482893, + "epoch": 0.5258221607647448, + "flos": 23548961145600.0, + "grad_norm": 1.949150156501406, + "language_loss": 0.81780863, + "learning_rate": 1.9299069084400734e-06, + "loss": 0.83987045, + "num_input_tokens_seen": 94482395, + "step": 4373, + "time_per_iteration": 2.795758008956909 + }, + { + "auxiliary_loss_clip": 0.01110178, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_clip": 1.02710629, + "balance_loss_mlp": 1.00441527, + "epoch": 0.5259424036553839, + "flos": 24966103403520.0, + "grad_norm": 3.2513651603031564, + "language_loss": 0.69991183, + "learning_rate": 1.9291284206659717e-06, + "loss": 0.72186315, + "num_input_tokens_seen": 94500580, + "step": 4374, + "time_per_iteration": 2.800358295440674 + }, + { + "auxiliary_loss_clip": 0.01140841, + "auxiliary_loss_mlp": 0.010855, + "balance_loss_clip": 1.0294807, + "balance_loss_mlp": 1.00496209, + "epoch": 0.526062646546023, + "flos": 28763908295040.0, + "grad_norm": 1.779350234320747, + "language_loss": 0.71699703, + "learning_rate": 1.928349943643032e-06, + "loss": 0.73926044, + "num_input_tokens_seen": 94519680, + "step": 4375, + "time_per_iteration": 2.6769959926605225 + }, + { + "auxiliary_loss_clip": 0.0112539, + "auxiliary_loss_mlp": 0.01085445, + "balance_loss_clip": 1.02889311, + "balance_loss_mlp": 1.00500214, + "epoch": 0.526182889436662, + "flos": 22821375254400.0, + "grad_norm": 1.710060129742393, + "language_loss": 0.81914687, + "learning_rate": 1.9275714774893493e-06, + "loss": 0.84125525, + "num_input_tokens_seen": 94539135, + "step": 4376, + "time_per_iteration": 2.6078341007232666 + }, + { + "auxiliary_loss_clip": 0.01106428, + "auxiliary_loss_mlp": 0.01085, + "balance_loss_clip": 1.02616632, + "balance_loss_mlp": 1.004462, + "epoch": 0.5263031323273012, + "flos": 22929466256640.0, + "grad_norm": 1.985388369877525, + "language_loss": 0.72829765, + "learning_rate": 1.9267930223230154e-06, + "loss": 0.75021195, + "num_input_tokens_seen": 94557610, + "step": 4377, + "time_per_iteration": 2.718646764755249 + }, + { + "auxiliary_loss_clip": 0.01121843, + "auxiliary_loss_mlp": 0.01085139, + "balance_loss_clip": 1.02753592, + "balance_loss_mlp": 1.00464892, + "epoch": 0.5264233752179402, + "flos": 17748634049280.0, + "grad_norm": 1.968414687878642, + "language_loss": 0.78098875, + "learning_rate": 1.9260145782621224e-06, + "loss": 0.80305856, + "num_input_tokens_seen": 94575390, + "step": 4378, + "time_per_iteration": 2.705533027648926 + }, + { + "auxiliary_loss_clip": 0.01119998, + "auxiliary_loss_mlp": 0.01084228, + "balance_loss_clip": 1.02749276, + "balance_loss_mlp": 1.00378561, + "epoch": 0.5265436181085793, + "flos": 24421626069120.0, + "grad_norm": 1.7637581391942947, + "language_loss": 0.87779903, + "learning_rate": 1.925236145424758e-06, + "loss": 0.89984131, + "num_input_tokens_seen": 94594210, + "step": 4379, + "time_per_iteration": 2.6949424743652344 + }, + { + "auxiliary_loss_clip": 0.01112895, + "auxiliary_loss_mlp": 0.01079323, + "balance_loss_clip": 1.02440405, + "balance_loss_mlp": 1.00035906, + "epoch": 0.5266638609992185, + "flos": 69207298156800.0, + "grad_norm": 0.7025216447965927, + "language_loss": 0.57590973, + "learning_rate": 1.924457723929012e-06, + "loss": 0.59783196, + "num_input_tokens_seen": 94665020, + "step": 4380, + "time_per_iteration": 3.3620517253875732 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.01084622, + "balance_loss_clip": 1.02904034, + "balance_loss_mlp": 1.00422716, + "epoch": 0.5267841038898575, + "flos": 20738699850240.0, + "grad_norm": 1.911768063587066, + "language_loss": 0.83045596, + "learning_rate": 1.9236793138929685e-06, + "loss": 0.85261786, + "num_input_tokens_seen": 94684290, + "step": 4381, + "time_per_iteration": 2.716576337814331 + }, + { + "auxiliary_loss_clip": 0.01133169, + "auxiliary_loss_mlp": 0.0108719, + "balance_loss_clip": 1.02897489, + "balance_loss_mlp": 1.00665188, + "epoch": 0.5269043467804966, + "flos": 17234392988160.0, + "grad_norm": 3.818285587418566, + "language_loss": 0.81347883, + "learning_rate": 1.9229009154347133e-06, + "loss": 0.83568245, + "num_input_tokens_seen": 94701880, + "step": 4382, + "time_per_iteration": 2.7136573791503906 + }, + { + "auxiliary_loss_clip": 0.01102703, + "auxiliary_loss_mlp": 0.00872933, + "balance_loss_clip": 1.02579892, + "balance_loss_mlp": 1.0000689, + "epoch": 0.5270245896711357, + "flos": 18223157646720.0, + "grad_norm": 2.0090855046318508, + "language_loss": 0.80725896, + "learning_rate": 1.922122528672327e-06, + "loss": 0.82701528, + "num_input_tokens_seen": 94720545, + "step": 4383, + "time_per_iteration": 2.748105049133301 + }, + { + "auxiliary_loss_clip": 0.01140295, + "auxiliary_loss_mlp": 0.01085161, + "balance_loss_clip": 1.02887928, + "balance_loss_mlp": 1.00481367, + "epoch": 0.5271448325617748, + "flos": 21287558643840.0, + "grad_norm": 2.1592314644822723, + "language_loss": 0.78743124, + "learning_rate": 1.9213441537238914e-06, + "loss": 0.80968583, + "num_input_tokens_seen": 94737420, + "step": 4384, + "time_per_iteration": 2.639341354370117 + }, + { + "auxiliary_loss_clip": 0.01076523, + "auxiliary_loss_mlp": 0.0107928, + "balance_loss_clip": 1.02196717, + "balance_loss_mlp": 0.99993479, + "epoch": 0.5272650754524139, + "flos": 65495497403520.0, + "grad_norm": 0.8259857784981727, + "language_loss": 0.57336563, + "learning_rate": 1.920565790707485e-06, + "loss": 0.59492368, + "num_input_tokens_seen": 94802810, + "step": 4385, + "time_per_iteration": 3.515089988708496 + }, + { + "auxiliary_loss_clip": 0.01091044, + "auxiliary_loss_mlp": 0.01086418, + "balance_loss_clip": 1.02905965, + "balance_loss_mlp": 1.00578499, + "epoch": 0.527385318343053, + "flos": 19676426008320.0, + "grad_norm": 3.847530237922913, + "language_loss": 0.66114998, + "learning_rate": 1.9197874397411853e-06, + "loss": 0.68292457, + "num_input_tokens_seen": 94819440, + "step": 4386, + "time_per_iteration": 3.5944762229919434 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01084015, + "balance_loss_clip": 1.02743423, + "balance_loss_mlp": 1.00362039, + "epoch": 0.5275055612336921, + "flos": 12712018947840.0, + "grad_norm": 3.1556247961904926, + "language_loss": 0.67313254, + "learning_rate": 1.919009100943067e-06, + "loss": 0.69512504, + "num_input_tokens_seen": 94835130, + "step": 4387, + "time_per_iteration": 3.683736562728882 + }, + { + "auxiliary_loss_clip": 0.01104456, + "auxiliary_loss_mlp": 0.01084332, + "balance_loss_clip": 1.02603269, + "balance_loss_mlp": 1.003842, + "epoch": 0.5276258041243311, + "flos": 17749029098880.0, + "grad_norm": 2.28845465648447, + "language_loss": 0.65813482, + "learning_rate": 1.9182307744312043e-06, + "loss": 0.68002272, + "num_input_tokens_seen": 94852235, + "step": 4388, + "time_per_iteration": 2.8412554264068604 + }, + { + "auxiliary_loss_clip": 0.01122797, + "auxiliary_loss_mlp": 0.01085575, + "balance_loss_clip": 1.02768588, + "balance_loss_mlp": 1.00508475, + "epoch": 0.5277460470149702, + "flos": 22710447077760.0, + "grad_norm": 1.9699374580155808, + "language_loss": 0.76115912, + "learning_rate": 1.9174524603236676e-06, + "loss": 0.78324282, + "num_input_tokens_seen": 94871185, + "step": 4389, + "time_per_iteration": 2.7399680614471436 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01085153, + "balance_loss_clip": 1.02535486, + "balance_loss_mlp": 1.00456786, + "epoch": 0.5278662899056094, + "flos": 19902699734400.0, + "grad_norm": 3.1388679641818724, + "language_loss": 0.76492298, + "learning_rate": 1.916674158738527e-06, + "loss": 0.78695297, + "num_input_tokens_seen": 94890090, + "step": 4390, + "time_per_iteration": 3.5822250843048096 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.00872997, + "balance_loss_clip": 1.02590656, + "balance_loss_mlp": 1.00006664, + "epoch": 0.5279865327962484, + "flos": 18005215875840.0, + "grad_norm": 1.8405040920497038, + "language_loss": 0.60023582, + "learning_rate": 1.9158958697938506e-06, + "loss": 0.62000144, + "num_input_tokens_seen": 94908470, + "step": 4391, + "time_per_iteration": 2.8083431720733643 + }, + { + "auxiliary_loss_clip": 0.01122462, + "auxiliary_loss_mlp": 0.01084712, + "balance_loss_clip": 1.02785277, + "balance_loss_mlp": 1.00417399, + "epoch": 0.5281067756868875, + "flos": 15924443892480.0, + "grad_norm": 2.1686753628951267, + "language_loss": 0.8564322, + "learning_rate": 1.9151175936077032e-06, + "loss": 0.87850392, + "num_input_tokens_seen": 94923440, + "step": 4392, + "time_per_iteration": 2.7003042697906494 + }, + { + "auxiliary_loss_clip": 0.01129362, + "auxiliary_loss_mlp": 0.01085481, + "balance_loss_clip": 1.02724254, + "balance_loss_mlp": 1.00508618, + "epoch": 0.5282270185775266, + "flos": 19426488197760.0, + "grad_norm": 1.508465168192232, + "language_loss": 0.79247934, + "learning_rate": 1.9143393302981507e-06, + "loss": 0.81462777, + "num_input_tokens_seen": 94941125, + "step": 4393, + "time_per_iteration": 3.6243736743927 + }, + { + "auxiliary_loss_clip": 0.01123802, + "auxiliary_loss_mlp": 0.01085595, + "balance_loss_clip": 1.02890825, + "balance_loss_mlp": 1.00520015, + "epoch": 0.5283472614681657, + "flos": 16399613934720.0, + "grad_norm": 1.6398307446682432, + "language_loss": 0.83238351, + "learning_rate": 1.913561079983252e-06, + "loss": 0.85447747, + "num_input_tokens_seen": 94959950, + "step": 4394, + "time_per_iteration": 2.6967012882232666 + }, + { + "auxiliary_loss_clip": 0.01109195, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_clip": 1.02906787, + "balance_loss_mlp": 1.00572693, + "epoch": 0.5284675043588047, + "flos": 26760524163840.0, + "grad_norm": 2.039697963942752, + "language_loss": 0.7452392, + "learning_rate": 1.9127828427810693e-06, + "loss": 0.76719475, + "num_input_tokens_seen": 94980515, + "step": 4395, + "time_per_iteration": 2.773974657058716 + }, + { + "auxiliary_loss_clip": 0.01113466, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_clip": 1.02677608, + "balance_loss_mlp": 1.00411499, + "epoch": 0.5285877472494439, + "flos": 19899898473600.0, + "grad_norm": 2.054590220830768, + "language_loss": 0.81137383, + "learning_rate": 1.9120046188096607e-06, + "loss": 0.83335453, + "num_input_tokens_seen": 94998560, + "step": 4396, + "time_per_iteration": 2.7329392433166504 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_clip": 1.02379918, + "balance_loss_mlp": 1.00497532, + "epoch": 0.528707990140083, + "flos": 20011257613440.0, + "grad_norm": 1.794757981288435, + "language_loss": 0.74287522, + "learning_rate": 1.9112264081870804e-06, + "loss": 0.76488483, + "num_input_tokens_seen": 95016950, + "step": 4397, + "time_per_iteration": 2.7899820804595947 + }, + { + "auxiliary_loss_clip": 0.01102931, + "auxiliary_loss_mlp": 0.01085433, + "balance_loss_clip": 1.02084255, + "balance_loss_mlp": 1.00489533, + "epoch": 0.528828233030722, + "flos": 20667956014080.0, + "grad_norm": 1.9115160503842639, + "language_loss": 0.75598168, + "learning_rate": 1.9104482110313843e-06, + "loss": 0.77786529, + "num_input_tokens_seen": 95036540, + "step": 4398, + "time_per_iteration": 2.753282308578491 + }, + { + "auxiliary_loss_clip": 0.01129109, + "auxiliary_loss_mlp": 0.01083984, + "balance_loss_clip": 1.02708745, + "balance_loss_mlp": 1.00358891, + "epoch": 0.5289484759213612, + "flos": 25192448956800.0, + "grad_norm": 1.7523570216917146, + "language_loss": 0.74247932, + "learning_rate": 1.909670027460623e-06, + "loss": 0.76461029, + "num_input_tokens_seen": 95053840, + "step": 4399, + "time_per_iteration": 2.671255111694336 + }, + { + "auxiliary_loss_clip": 0.01131662, + "auxiliary_loss_mlp": 0.01084089, + "balance_loss_clip": 1.0292275, + "balance_loss_mlp": 1.00364637, + "epoch": 0.5290687188120002, + "flos": 31139255715840.0, + "grad_norm": 1.9143233229675594, + "language_loss": 0.71393239, + "learning_rate": 1.908891857592847e-06, + "loss": 0.73608983, + "num_input_tokens_seen": 95074910, + "step": 4400, + "time_per_iteration": 2.778621196746826 + }, + { + "auxiliary_loss_clip": 0.01104627, + "auxiliary_loss_mlp": 0.01083707, + "balance_loss_clip": 1.02288806, + "balance_loss_mlp": 1.00331211, + "epoch": 0.5291889617026393, + "flos": 20119851406080.0, + "grad_norm": 2.630652506886539, + "language_loss": 0.89922059, + "learning_rate": 1.9081137015461034e-06, + "loss": 0.92110389, + "num_input_tokens_seen": 95090985, + "step": 4401, + "time_per_iteration": 2.718628406524658 + }, + { + "auxiliary_loss_clip": 0.01101191, + "auxiliary_loss_mlp": 0.01085431, + "balance_loss_clip": 1.02654004, + "balance_loss_mlp": 1.00498819, + "epoch": 0.5293092045932785, + "flos": 19643747610240.0, + "grad_norm": 1.7797811318722836, + "language_loss": 0.90704215, + "learning_rate": 1.9073355594384383e-06, + "loss": 0.92890835, + "num_input_tokens_seen": 95109225, + "step": 4402, + "time_per_iteration": 2.9091286659240723 + }, + { + "auxiliary_loss_clip": 0.01107407, + "auxiliary_loss_mlp": 0.0108433, + "balance_loss_clip": 1.0231185, + "balance_loss_mlp": 1.00388765, + "epoch": 0.5294294474839175, + "flos": 24317736958080.0, + "grad_norm": 1.7929705768593238, + "language_loss": 0.80482316, + "learning_rate": 1.906557431387895e-06, + "loss": 0.8267405, + "num_input_tokens_seen": 95128215, + "step": 4403, + "time_per_iteration": 2.7928295135498047 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01086519, + "balance_loss_clip": 1.02268875, + "balance_loss_mlp": 1.00598085, + "epoch": 0.5295496903745566, + "flos": 18875941464960.0, + "grad_norm": 1.9128236713727416, + "language_loss": 0.78930223, + "learning_rate": 1.905779317512516e-06, + "loss": 0.81120974, + "num_input_tokens_seen": 95145760, + "step": 4404, + "time_per_iteration": 2.7210512161254883 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.01084903, + "balance_loss_clip": 1.02997506, + "balance_loss_mlp": 1.00450873, + "epoch": 0.5296699332651957, + "flos": 20923101296640.0, + "grad_norm": 1.8058843613551872, + "language_loss": 0.80683732, + "learning_rate": 1.9050012179303385e-06, + "loss": 0.82901371, + "num_input_tokens_seen": 95164270, + "step": 4405, + "time_per_iteration": 2.630253314971924 + }, + { + "auxiliary_loss_clip": 0.01131539, + "auxiliary_loss_mlp": 0.01083842, + "balance_loss_clip": 1.0272975, + "balance_loss_mlp": 1.0033524, + "epoch": 0.5297901761558348, + "flos": 22046745525120.0, + "grad_norm": 2.1839883841900107, + "language_loss": 0.68797541, + "learning_rate": 1.904223132759401e-06, + "loss": 0.71012926, + "num_input_tokens_seen": 95182870, + "step": 4406, + "time_per_iteration": 2.6939280033111572 + }, + { + "auxiliary_loss_clip": 0.0113013, + "auxiliary_loss_mlp": 0.01086123, + "balance_loss_clip": 1.02744639, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5299104190464738, + "flos": 21798495653760.0, + "grad_norm": 2.966987559669377, + "language_loss": 0.68660152, + "learning_rate": 1.9034450621177383e-06, + "loss": 0.70876408, + "num_input_tokens_seen": 95201190, + "step": 4407, + "time_per_iteration": 2.6550073623657227 + }, + { + "auxiliary_loss_clip": 0.01130913, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_clip": 1.02944887, + "balance_loss_mlp": 1.00469041, + "epoch": 0.530030661937113, + "flos": 14720790119040.0, + "grad_norm": 2.185395552039706, + "language_loss": 0.70378894, + "learning_rate": 1.9026670061233824e-06, + "loss": 0.72594941, + "num_input_tokens_seen": 95218625, + "step": 4408, + "time_per_iteration": 2.6864993572235107 + }, + { + "auxiliary_loss_clip": 0.01113152, + "auxiliary_loss_mlp": 0.01085256, + "balance_loss_clip": 1.02608633, + "balance_loss_mlp": 1.00490856, + "epoch": 0.5301509048277521, + "flos": 21251504367360.0, + "grad_norm": 1.5481979900089362, + "language_loss": 0.80597872, + "learning_rate": 1.901888964894365e-06, + "loss": 0.82796282, + "num_input_tokens_seen": 95237665, + "step": 4409, + "time_per_iteration": 2.722506284713745 + }, + { + "auxiliary_loss_clip": 0.01140791, + "auxiliary_loss_mlp": 0.0108443, + "balance_loss_clip": 1.02877092, + "balance_loss_mlp": 1.00393987, + "epoch": 0.5302711477183911, + "flos": 25957058791680.0, + "grad_norm": 1.8781809054039522, + "language_loss": 0.67486989, + "learning_rate": 1.9011109385487134e-06, + "loss": 0.6971221, + "num_input_tokens_seen": 95258915, + "step": 4410, + "time_per_iteration": 2.6617865562438965 + }, + { + "auxiliary_loss_clip": 0.01140758, + "auxiliary_loss_mlp": 0.01085828, + "balance_loss_clip": 1.02901912, + "balance_loss_mlp": 1.00529003, + "epoch": 0.5303913906090303, + "flos": 22273126992000.0, + "grad_norm": 6.45661214131699, + "language_loss": 0.66221106, + "learning_rate": 1.900332927204454e-06, + "loss": 0.68447697, + "num_input_tokens_seen": 95277365, + "step": 4411, + "time_per_iteration": 2.6191580295562744 + }, + { + "auxiliary_loss_clip": 0.01106288, + "auxiliary_loss_mlp": 0.01084714, + "balance_loss_clip": 1.02747989, + "balance_loss_mlp": 1.00417662, + "epoch": 0.5305116334996693, + "flos": 24936010784640.0, + "grad_norm": 1.7308177534203542, + "language_loss": 0.76401794, + "learning_rate": 1.8995549309796097e-06, + "loss": 0.78592789, + "num_input_tokens_seen": 95296670, + "step": 4412, + "time_per_iteration": 4.551636219024658 + }, + { + "auxiliary_loss_clip": 0.01117329, + "auxiliary_loss_mlp": 0.0108484, + "balance_loss_clip": 1.03034317, + "balance_loss_mlp": 1.00430226, + "epoch": 0.5306318763903084, + "flos": 20189338266240.0, + "grad_norm": 1.8003479744323234, + "language_loss": 0.76422095, + "learning_rate": 1.8987769499922028e-06, + "loss": 0.78624266, + "num_input_tokens_seen": 95315640, + "step": 4413, + "time_per_iteration": 2.635221242904663 + }, + { + "auxiliary_loss_clip": 0.01129031, + "auxiliary_loss_mlp": 0.00873061, + "balance_loss_clip": 1.02749228, + "balance_loss_mlp": 1.00005627, + "epoch": 0.5307521192809476, + "flos": 20266366982400.0, + "grad_norm": 2.165203890601147, + "language_loss": 0.70676112, + "learning_rate": 1.897998984360252e-06, + "loss": 0.72678202, + "num_input_tokens_seen": 95334610, + "step": 4414, + "time_per_iteration": 2.7589187622070312 + }, + { + "auxiliary_loss_clip": 0.01121057, + "auxiliary_loss_mlp": 0.01085033, + "balance_loss_clip": 1.02737284, + "balance_loss_mlp": 1.00463784, + "epoch": 0.5308723621715866, + "flos": 28844276976000.0, + "grad_norm": 1.4829274773336127, + "language_loss": 0.78556478, + "learning_rate": 1.897221034201775e-06, + "loss": 0.80762571, + "num_input_tokens_seen": 95358350, + "step": 4415, + "time_per_iteration": 3.770559549331665 + }, + { + "auxiliary_loss_clip": 0.01095645, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_clip": 1.02665854, + "balance_loss_mlp": 1.00445426, + "epoch": 0.5309926050622257, + "flos": 27457766040960.0, + "grad_norm": 1.4519264086098893, + "language_loss": 0.66863579, + "learning_rate": 1.8964430996347842e-06, + "loss": 0.69044125, + "num_input_tokens_seen": 95379900, + "step": 4416, + "time_per_iteration": 2.850196123123169 + }, + { + "auxiliary_loss_clip": 0.01121582, + "auxiliary_loss_mlp": 0.01085833, + "balance_loss_clip": 1.02744818, + "balance_loss_mlp": 1.00519967, + "epoch": 0.5311128479528648, + "flos": 20514545026560.0, + "grad_norm": 1.7298160455182192, + "language_loss": 0.82555246, + "learning_rate": 1.8956651807772931e-06, + "loss": 0.84762663, + "num_input_tokens_seen": 95397935, + "step": 4417, + "time_per_iteration": 2.6978087425231934 + }, + { + "auxiliary_loss_clip": 0.01129021, + "auxiliary_loss_mlp": 0.01084709, + "balance_loss_clip": 1.02770185, + "balance_loss_mlp": 1.00440979, + "epoch": 0.5312330908435039, + "flos": 21397660807680.0, + "grad_norm": 1.7355020547720172, + "language_loss": 0.83525884, + "learning_rate": 1.8948872777473115e-06, + "loss": 0.85739625, + "num_input_tokens_seen": 95415890, + "step": 4418, + "time_per_iteration": 3.621485471725464 + }, + { + "auxiliary_loss_clip": 0.0112056, + "auxiliary_loss_mlp": 0.0108465, + "balance_loss_clip": 1.02723193, + "balance_loss_mlp": 1.00420737, + "epoch": 0.531353333734143, + "flos": 24717350741760.0, + "grad_norm": 1.615281674218468, + "language_loss": 0.63531655, + "learning_rate": 1.8941093906628458e-06, + "loss": 0.65736866, + "num_input_tokens_seen": 95433675, + "step": 4419, + "time_per_iteration": 2.7543821334838867 + }, + { + "auxiliary_loss_clip": 0.01121785, + "auxiliary_loss_mlp": 0.01085153, + "balance_loss_clip": 1.02773738, + "balance_loss_mlp": 1.00466299, + "epoch": 0.531473576624782, + "flos": 30480689808000.0, + "grad_norm": 1.76721850207598, + "language_loss": 0.70817697, + "learning_rate": 1.893331519641902e-06, + "loss": 0.73024631, + "num_input_tokens_seen": 95455820, + "step": 4420, + "time_per_iteration": 2.732477903366089 + }, + { + "auxiliary_loss_clip": 0.01113065, + "auxiliary_loss_mlp": 0.01083657, + "balance_loss_clip": 1.02650619, + "balance_loss_mlp": 1.00321424, + "epoch": 0.5315938195154212, + "flos": 23002975440000.0, + "grad_norm": 2.216227480922434, + "language_loss": 0.73887825, + "learning_rate": 1.8925536648024815e-06, + "loss": 0.76084542, + "num_input_tokens_seen": 95473240, + "step": 4421, + "time_per_iteration": 2.807276964187622 + }, + { + "auxiliary_loss_clip": 0.01141041, + "auxiliary_loss_mlp": 0.01084146, + "balance_loss_clip": 1.02901006, + "balance_loss_mlp": 1.00360847, + "epoch": 0.5317140624060602, + "flos": 22748584343040.0, + "grad_norm": 3.042834939076781, + "language_loss": 0.7581141, + "learning_rate": 1.8917758262625849e-06, + "loss": 0.780366, + "num_input_tokens_seen": 95493480, + "step": 4422, + "time_per_iteration": 2.6244921684265137 + }, + { + "auxiliary_loss_clip": 0.01118829, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_clip": 1.02682221, + "balance_loss_mlp": 1.0038712, + "epoch": 0.5318343052966993, + "flos": 22821087945600.0, + "grad_norm": 1.6369517361326757, + "language_loss": 0.80925924, + "learning_rate": 1.8909980041402089e-06, + "loss": 0.83129019, + "num_input_tokens_seen": 95512075, + "step": 4423, + "time_per_iteration": 2.7494313716888428 + }, + { + "auxiliary_loss_clip": 0.01132231, + "auxiliary_loss_mlp": 0.01086339, + "balance_loss_clip": 1.02871323, + "balance_loss_mlp": 1.00575376, + "epoch": 0.5319545481873384, + "flos": 13626089274240.0, + "grad_norm": 1.9007696481379817, + "language_loss": 0.65785563, + "learning_rate": 1.8902201985533494e-06, + "loss": 0.68004137, + "num_input_tokens_seen": 95529340, + "step": 4424, + "time_per_iteration": 2.6679534912109375 + }, + { + "auxiliary_loss_clip": 0.0111904, + "auxiliary_loss_mlp": 0.01085267, + "balance_loss_clip": 1.02596819, + "balance_loss_mlp": 1.00482488, + "epoch": 0.5320747910779775, + "flos": 22162522037760.0, + "grad_norm": 1.7170217021325054, + "language_loss": 0.7488789, + "learning_rate": 1.8894424096199983e-06, + "loss": 0.77092195, + "num_input_tokens_seen": 95548545, + "step": 4425, + "time_per_iteration": 2.7283947467803955 + }, + { + "auxiliary_loss_clip": 0.01130412, + "auxiliary_loss_mlp": 0.01086103, + "balance_loss_clip": 1.02814901, + "balance_loss_mlp": 1.00556564, + "epoch": 0.5321950339686166, + "flos": 18588081870720.0, + "grad_norm": 1.872025967226416, + "language_loss": 0.85345697, + "learning_rate": 1.8886646374581463e-06, + "loss": 0.87562203, + "num_input_tokens_seen": 95567770, + "step": 4426, + "time_per_iteration": 2.652766227722168 + }, + { + "auxiliary_loss_clip": 0.01130489, + "auxiliary_loss_mlp": 0.01086196, + "balance_loss_clip": 1.02729249, + "balance_loss_mlp": 1.00570583, + "epoch": 0.5323152768592557, + "flos": 22856818999680.0, + "grad_norm": 1.6133855552424676, + "language_loss": 0.71216583, + "learning_rate": 1.8878868821857795e-06, + "loss": 0.73433268, + "num_input_tokens_seen": 95587420, + "step": 4427, + "time_per_iteration": 2.713310480117798 + }, + { + "auxiliary_loss_clip": 0.01098788, + "auxiliary_loss_mlp": 0.0108499, + "balance_loss_clip": 1.02239299, + "balance_loss_mlp": 1.00445223, + "epoch": 0.5324355197498948, + "flos": 33948690998400.0, + "grad_norm": 1.90886914277294, + "language_loss": 0.75189167, + "learning_rate": 1.8871091439208838e-06, + "loss": 0.77372944, + "num_input_tokens_seen": 95609030, + "step": 4428, + "time_per_iteration": 2.9442765712738037 + }, + { + "auxiliary_loss_clip": 0.01104272, + "auxiliary_loss_mlp": 0.01084813, + "balance_loss_clip": 1.02745652, + "balance_loss_mlp": 1.0042274, + "epoch": 0.5325557626405338, + "flos": 23256720092160.0, + "grad_norm": 1.9863402435559296, + "language_loss": 0.77046084, + "learning_rate": 1.8863314227814414e-06, + "loss": 0.79235172, + "num_input_tokens_seen": 95627340, + "step": 4429, + "time_per_iteration": 2.8363900184631348 + }, + { + "auxiliary_loss_clip": 0.01131566, + "auxiliary_loss_mlp": 0.01084712, + "balance_loss_clip": 1.02814758, + "balance_loss_mlp": 1.00417459, + "epoch": 0.532676005531173, + "flos": 26718687797760.0, + "grad_norm": 2.540997095311027, + "language_loss": 0.48286381, + "learning_rate": 1.8855537188854313e-06, + "loss": 0.50502658, + "num_input_tokens_seen": 95646315, + "step": 4430, + "time_per_iteration": 2.754819393157959 + }, + { + "auxiliary_loss_clip": 0.01131482, + "auxiliary_loss_mlp": 0.01085001, + "balance_loss_clip": 1.02737761, + "balance_loss_mlp": 1.00451088, + "epoch": 0.5327962484218121, + "flos": 17894610921600.0, + "grad_norm": 1.9374165656968998, + "language_loss": 0.77929288, + "learning_rate": 1.8847760323508315e-06, + "loss": 0.80145776, + "num_input_tokens_seen": 95665220, + "step": 4431, + "time_per_iteration": 2.6142489910125732 + }, + { + "auxiliary_loss_clip": 0.01118161, + "auxiliary_loss_mlp": 0.01086182, + "balance_loss_clip": 1.02737904, + "balance_loss_mlp": 1.00569189, + "epoch": 0.5329164913124511, + "flos": 17925385898880.0, + "grad_norm": 1.787409450949194, + "language_loss": 0.75333512, + "learning_rate": 1.883998363295616e-06, + "loss": 0.77537847, + "num_input_tokens_seen": 95682700, + "step": 4432, + "time_per_iteration": 2.7154905796051025 + }, + { + "auxiliary_loss_clip": 0.01104309, + "auxiliary_loss_mlp": 0.01079521, + "balance_loss_clip": 1.02422023, + "balance_loss_mlp": 1.00017536, + "epoch": 0.5330367342030903, + "flos": 57254178781440.0, + "grad_norm": 0.9199443947978244, + "language_loss": 0.62619156, + "learning_rate": 1.8832207118377565e-06, + "loss": 0.64802986, + "num_input_tokens_seen": 95738070, + "step": 4433, + "time_per_iteration": 3.1681275367736816 + }, + { + "auxiliary_loss_clip": 0.01141056, + "auxiliary_loss_mlp": 0.0108494, + "balance_loss_clip": 1.02918649, + "balance_loss_mlp": 1.00454485, + "epoch": 0.5331569770937293, + "flos": 17420518287360.0, + "grad_norm": 1.9858161441903777, + "language_loss": 0.69824064, + "learning_rate": 1.882443078095222e-06, + "loss": 0.72050059, + "num_input_tokens_seen": 95756950, + "step": 4434, + "time_per_iteration": 2.6230101585388184 + }, + { + "auxiliary_loss_clip": 0.01087142, + "auxiliary_loss_mlp": 0.01080241, + "balance_loss_clip": 1.0238266, + "balance_loss_mlp": 1.00089538, + "epoch": 0.5332772199843684, + "flos": 56750783627520.0, + "grad_norm": 0.857109279463928, + "language_loss": 0.66789973, + "learning_rate": 1.8816654621859794e-06, + "loss": 0.68957353, + "num_input_tokens_seen": 95816615, + "step": 4435, + "time_per_iteration": 3.2589304447174072 + }, + { + "auxiliary_loss_clip": 0.01139226, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_clip": 1.0283345, + "balance_loss_mlp": 1.00501299, + "epoch": 0.5333974628750076, + "flos": 18697753071360.0, + "grad_norm": 2.5482037140136446, + "language_loss": 0.72539908, + "learning_rate": 1.8808878642279915e-06, + "loss": 0.74764639, + "num_input_tokens_seen": 95832020, + "step": 4436, + "time_per_iteration": 2.6347954273223877 + }, + { + "auxiliary_loss_clip": 0.0111554, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_clip": 1.02757525, + "balance_loss_mlp": 1.00390077, + "epoch": 0.5335177057656466, + "flos": 23805507058560.0, + "grad_norm": 2.225870955845284, + "language_loss": 0.65449655, + "learning_rate": 1.8801102843392209e-06, + "loss": 0.6764968, + "num_input_tokens_seen": 95851425, + "step": 4437, + "time_per_iteration": 4.593932390213013 + }, + { + "auxiliary_loss_clip": 0.01112871, + "auxiliary_loss_mlp": 0.01085565, + "balance_loss_clip": 1.0268805, + "balance_loss_mlp": 1.00502682, + "epoch": 0.5336379486562857, + "flos": 25078683605760.0, + "grad_norm": 1.53485759908224, + "language_loss": 0.85114801, + "learning_rate": 1.8793327226376238e-06, + "loss": 0.87313235, + "num_input_tokens_seen": 95870745, + "step": 4438, + "time_per_iteration": 2.812528371810913 + }, + { + "auxiliary_loss_clip": 0.01107942, + "auxiliary_loss_mlp": 0.01085116, + "balance_loss_clip": 1.02865887, + "balance_loss_mlp": 1.00457799, + "epoch": 0.5337581915469248, + "flos": 21396691140480.0, + "grad_norm": 1.6789180719992842, + "language_loss": 0.80241692, + "learning_rate": 1.8785551792411569e-06, + "loss": 0.8243475, + "num_input_tokens_seen": 95889755, + "step": 4439, + "time_per_iteration": 2.7257096767425537 + }, + { + "auxiliary_loss_clip": 0.01121885, + "auxiliary_loss_mlp": 0.01084097, + "balance_loss_clip": 1.02738833, + "balance_loss_mlp": 1.00370193, + "epoch": 0.5338784344375639, + "flos": 14865905064960.0, + "grad_norm": 1.882965160344828, + "language_loss": 0.82449818, + "learning_rate": 1.8777776542677733e-06, + "loss": 0.84655797, + "num_input_tokens_seen": 95907805, + "step": 4440, + "time_per_iteration": 2.6930224895477295 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01085319, + "balance_loss_clip": 1.02506256, + "balance_loss_mlp": 1.00478101, + "epoch": 0.5339986773282029, + "flos": 20813501923200.0, + "grad_norm": 1.7256058626599942, + "language_loss": 0.72827411, + "learning_rate": 1.8770001478354216e-06, + "loss": 0.7502321, + "num_input_tokens_seen": 95927480, + "step": 4441, + "time_per_iteration": 3.6790688037872314 + }, + { + "auxiliary_loss_clip": 0.0113293, + "auxiliary_loss_mlp": 0.01085216, + "balance_loss_clip": 1.02970302, + "balance_loss_mlp": 1.00463045, + "epoch": 0.5341189202188421, + "flos": 17969089772160.0, + "grad_norm": 2.0114694072204706, + "language_loss": 0.83807075, + "learning_rate": 1.8762226600620504e-06, + "loss": 0.8602522, + "num_input_tokens_seen": 95946095, + "step": 4442, + "time_per_iteration": 2.684835195541382 + }, + { + "auxiliary_loss_clip": 0.011243, + "auxiliary_loss_mlp": 0.01085342, + "balance_loss_clip": 1.02848041, + "balance_loss_mlp": 1.00485229, + "epoch": 0.5342391631094812, + "flos": 11031866328960.0, + "grad_norm": 4.705703106412853, + "language_loss": 0.59271252, + "learning_rate": 1.8754451910656031e-06, + "loss": 0.61480898, + "num_input_tokens_seen": 95959995, + "step": 4443, + "time_per_iteration": 3.6052699089050293 + }, + { + "auxiliary_loss_clip": 0.01102718, + "auxiliary_loss_mlp": 0.01085836, + "balance_loss_clip": 1.02561665, + "balance_loss_mlp": 1.00534558, + "epoch": 0.5343594060001202, + "flos": 15339135772800.0, + "grad_norm": 2.3719674454018906, + "language_loss": 0.8259818, + "learning_rate": 1.8746677409640212e-06, + "loss": 0.84786737, + "num_input_tokens_seen": 95977095, + "step": 4444, + "time_per_iteration": 2.7580008506774902 + }, + { + "auxiliary_loss_clip": 0.01132162, + "auxiliary_loss_mlp": 0.01086477, + "balance_loss_clip": 1.02908278, + "balance_loss_mlp": 1.0060823, + "epoch": 0.5344796488907594, + "flos": 26900898514560.0, + "grad_norm": 1.928609961496075, + "language_loss": 0.84475946, + "learning_rate": 1.8738903098752432e-06, + "loss": 0.86694586, + "num_input_tokens_seen": 95996225, + "step": 4445, + "time_per_iteration": 2.721181869506836 + }, + { + "auxiliary_loss_clip": 0.01124143, + "auxiliary_loss_mlp": 0.01085224, + "balance_loss_clip": 1.02908802, + "balance_loss_mlp": 1.00482953, + "epoch": 0.5345998917813984, + "flos": 25411216740480.0, + "grad_norm": 1.9669584592652734, + "language_loss": 0.73367643, + "learning_rate": 1.8731128979172052e-06, + "loss": 0.75577015, + "num_input_tokens_seen": 96015425, + "step": 4446, + "time_per_iteration": 2.7244017124176025 + }, + { + "auxiliary_loss_clip": 0.01116437, + "auxiliary_loss_mlp": 0.01084735, + "balance_loss_clip": 1.02464628, + "balance_loss_mlp": 1.00419712, + "epoch": 0.5347201346720375, + "flos": 32853379622400.0, + "grad_norm": 2.286689303269176, + "language_loss": 0.6715982, + "learning_rate": 1.8723355052078394e-06, + "loss": 0.69360989, + "num_input_tokens_seen": 96035460, + "step": 4447, + "time_per_iteration": 3.0639100074768066 + }, + { + "auxiliary_loss_clip": 0.0113275, + "auxiliary_loss_mlp": 0.01086007, + "balance_loss_clip": 1.02876937, + "balance_loss_mlp": 1.00532651, + "epoch": 0.5348403775626767, + "flos": 17967940536960.0, + "grad_norm": 2.5579864265746064, + "language_loss": 0.77444398, + "learning_rate": 1.8715581318650765e-06, + "loss": 0.79663146, + "num_input_tokens_seen": 96054515, + "step": 4448, + "time_per_iteration": 2.63834547996521 + }, + { + "auxiliary_loss_clip": 0.01113868, + "auxiliary_loss_mlp": 0.01084745, + "balance_loss_clip": 1.02790499, + "balance_loss_mlp": 1.00425506, + "epoch": 0.5349606204533157, + "flos": 17603339535360.0, + "grad_norm": 2.6530380674664102, + "language_loss": 0.8153522, + "learning_rate": 1.8707807780068422e-06, + "loss": 0.83733839, + "num_input_tokens_seen": 96072330, + "step": 4449, + "time_per_iteration": 2.7336478233337402 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01084606, + "balance_loss_clip": 1.02475166, + "balance_loss_mlp": 1.00421119, + "epoch": 0.5350808633439548, + "flos": 29167831710720.0, + "grad_norm": 1.8006719028158469, + "language_loss": 0.65928513, + "learning_rate": 1.8700034437510611e-06, + "loss": 0.68126726, + "num_input_tokens_seen": 96092425, + "step": 4450, + "time_per_iteration": 2.7818777561187744 + }, + { + "auxiliary_loss_clip": 0.01114368, + "auxiliary_loss_mlp": 0.01084567, + "balance_loss_clip": 1.02841616, + "balance_loss_mlp": 1.00402963, + "epoch": 0.5352011062345938, + "flos": 19499997381120.0, + "grad_norm": 2.4192376206421673, + "language_loss": 0.81470531, + "learning_rate": 1.8692261292156549e-06, + "loss": 0.83669472, + "num_input_tokens_seen": 96111660, + "step": 4451, + "time_per_iteration": 2.6906018257141113 + }, + { + "auxiliary_loss_clip": 0.01140946, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_clip": 1.02994251, + "balance_loss_mlp": 1.00510502, + "epoch": 0.535321349125233, + "flos": 23477642691840.0, + "grad_norm": 2.0629147584310656, + "language_loss": 0.80926585, + "learning_rate": 1.8684488345185401e-06, + "loss": 0.83153081, + "num_input_tokens_seen": 96131835, + "step": 4452, + "time_per_iteration": 2.6436777114868164 + }, + { + "auxiliary_loss_clip": 0.01140134, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_clip": 1.02892959, + "balance_loss_mlp": 1.00569463, + "epoch": 0.535441592015872, + "flos": 20478059786880.0, + "grad_norm": 2.3530686313204536, + "language_loss": 0.78815526, + "learning_rate": 1.8676715597776332e-06, + "loss": 0.81041938, + "num_input_tokens_seen": 96150180, + "step": 4453, + "time_per_iteration": 2.5928955078125 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01083999, + "balance_loss_clip": 1.02618837, + "balance_loss_mlp": 1.00365186, + "epoch": 0.5355618349065111, + "flos": 19573147428480.0, + "grad_norm": 1.950997471675527, + "language_loss": 0.76002705, + "learning_rate": 1.8668943051108455e-06, + "loss": 0.78189373, + "num_input_tokens_seen": 96167485, + "step": 4454, + "time_per_iteration": 2.7842557430267334 + }, + { + "auxiliary_loss_clip": 0.01122911, + "auxiliary_loss_mlp": 0.01084603, + "balance_loss_clip": 1.02836871, + "balance_loss_mlp": 1.00401759, + "epoch": 0.5356820777971503, + "flos": 24024633978240.0, + "grad_norm": 1.6619414142462838, + "language_loss": 0.76124454, + "learning_rate": 1.8661170706360856e-06, + "loss": 0.78331971, + "num_input_tokens_seen": 96186650, + "step": 4455, + "time_per_iteration": 2.713113784790039 + }, + { + "auxiliary_loss_clip": 0.01129448, + "auxiliary_loss_mlp": 0.01084402, + "balance_loss_clip": 1.02778792, + "balance_loss_mlp": 1.00405526, + "epoch": 0.5358023206877893, + "flos": 20884676722560.0, + "grad_norm": 1.561574292210155, + "language_loss": 0.81355536, + "learning_rate": 1.8653398564712594e-06, + "loss": 0.8356939, + "num_input_tokens_seen": 96205595, + "step": 4456, + "time_per_iteration": 2.708287477493286 + }, + { + "auxiliary_loss_clip": 0.01130455, + "auxiliary_loss_mlp": 0.01084998, + "balance_loss_clip": 1.02843285, + "balance_loss_mlp": 1.00465107, + "epoch": 0.5359225635784284, + "flos": 22418996123520.0, + "grad_norm": 2.317412833284651, + "language_loss": 0.82243383, + "learning_rate": 1.8645626627342704e-06, + "loss": 0.8445884, + "num_input_tokens_seen": 96226360, + "step": 4457, + "time_per_iteration": 2.7031373977661133 + }, + { + "auxiliary_loss_clip": 0.01132427, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.02885818, + "balance_loss_mlp": 1.00389862, + "epoch": 0.5360428064690675, + "flos": 24097784025600.0, + "grad_norm": 2.148324904395349, + "language_loss": 0.80489433, + "learning_rate": 1.8637854895430172e-06, + "loss": 0.82706255, + "num_input_tokens_seen": 96245625, + "step": 4458, + "time_per_iteration": 2.7482986450195312 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01085063, + "balance_loss_clip": 1.02311826, + "balance_loss_mlp": 1.00447798, + "epoch": 0.5361630493597066, + "flos": 21434505183360.0, + "grad_norm": 2.21162065829546, + "language_loss": 0.69659936, + "learning_rate": 1.8630083370153978e-06, + "loss": 0.71852708, + "num_input_tokens_seen": 96265265, + "step": 4459, + "time_per_iteration": 2.81545090675354 + }, + { + "auxiliary_loss_clip": 0.01080356, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_clip": 1.02431178, + "balance_loss_mlp": 1.0001055, + "epoch": 0.5362832922503457, + "flos": 68888696520960.0, + "grad_norm": 0.7459560051885255, + "language_loss": 0.55520439, + "learning_rate": 1.8622312052693041e-06, + "loss": 0.57680243, + "num_input_tokens_seen": 96326445, + "step": 4460, + "time_per_iteration": 3.537384033203125 + }, + { + "auxiliary_loss_clip": 0.0113159, + "auxiliary_loss_mlp": 0.01084447, + "balance_loss_clip": 1.02819836, + "balance_loss_mlp": 1.00395727, + "epoch": 0.5364035351409848, + "flos": 9793702563840.0, + "grad_norm": 2.02221392844822, + "language_loss": 0.723315, + "learning_rate": 1.8614540944226267e-06, + "loss": 0.74547535, + "num_input_tokens_seen": 96343115, + "step": 4461, + "time_per_iteration": 2.921132802963257 + }, + { + "auxiliary_loss_clip": 0.01119743, + "auxiliary_loss_mlp": 0.01085796, + "balance_loss_clip": 1.02748084, + "balance_loss_mlp": 1.00530601, + "epoch": 0.5365237780316239, + "flos": 23290080848640.0, + "grad_norm": 2.2396414152197424, + "language_loss": 0.68042386, + "learning_rate": 1.8606770045932537e-06, + "loss": 0.70247924, + "num_input_tokens_seen": 96362230, + "step": 4462, + "time_per_iteration": 3.650635004043579 + }, + { + "auxiliary_loss_clip": 0.01115048, + "auxiliary_loss_mlp": 0.01086288, + "balance_loss_clip": 1.0269115, + "balance_loss_mlp": 1.00575042, + "epoch": 0.5366440209222629, + "flos": 26578133879040.0, + "grad_norm": 1.767044676430498, + "language_loss": 0.81681609, + "learning_rate": 1.859899935899068e-06, + "loss": 0.83882946, + "num_input_tokens_seen": 96382085, + "step": 4463, + "time_per_iteration": 3.799499034881592 + }, + { + "auxiliary_loss_clip": 0.01113693, + "auxiliary_loss_mlp": 0.0108584, + "balance_loss_clip": 1.022879, + "balance_loss_mlp": 1.00534964, + "epoch": 0.5367642638129021, + "flos": 19608052469760.0, + "grad_norm": 1.4381859790054874, + "language_loss": 0.7886095, + "learning_rate": 1.8591228884579506e-06, + "loss": 0.81060481, + "num_input_tokens_seen": 96400580, + "step": 4464, + "time_per_iteration": 2.742522954940796 + }, + { + "auxiliary_loss_clip": 0.01110838, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_clip": 1.02573943, + "balance_loss_mlp": 1.00423205, + "epoch": 0.5368845067035412, + "flos": 23915214172800.0, + "grad_norm": 2.1041031676631614, + "language_loss": 0.82038826, + "learning_rate": 1.8583458623877795e-06, + "loss": 0.8423425, + "num_input_tokens_seen": 96419680, + "step": 4465, + "time_per_iteration": 2.7352097034454346 + }, + { + "auxiliary_loss_clip": 0.01130801, + "auxiliary_loss_mlp": 0.01085278, + "balance_loss_clip": 1.02770019, + "balance_loss_mlp": 1.00478792, + "epoch": 0.5370047495941802, + "flos": 16873131951360.0, + "grad_norm": 1.8941512473012656, + "language_loss": 0.74338865, + "learning_rate": 1.8575688578064281e-06, + "loss": 0.76554942, + "num_input_tokens_seen": 96437805, + "step": 4466, + "time_per_iteration": 2.67435359954834 + }, + { + "auxiliary_loss_clip": 0.01132722, + "auxiliary_loss_mlp": 0.01084631, + "balance_loss_clip": 1.02970862, + "balance_loss_mlp": 1.00418901, + "epoch": 0.5371249924848194, + "flos": 20740926493440.0, + "grad_norm": 1.7983169859244155, + "language_loss": 0.76719046, + "learning_rate": 1.8567918748317674e-06, + "loss": 0.78936398, + "num_input_tokens_seen": 96457155, + "step": 4467, + "time_per_iteration": 3.64676570892334 + }, + { + "auxiliary_loss_clip": 0.0111517, + "auxiliary_loss_mlp": 0.01086452, + "balance_loss_clip": 1.0274483, + "balance_loss_mlp": 1.00596189, + "epoch": 0.5372452353754584, + "flos": 17968120104960.0, + "grad_norm": 2.0663856666803193, + "language_loss": 0.83293533, + "learning_rate": 1.8560149135816659e-06, + "loss": 0.85495162, + "num_input_tokens_seen": 96473990, + "step": 4468, + "time_per_iteration": 3.670438528060913 + }, + { + "auxiliary_loss_clip": 0.01130837, + "auxiliary_loss_mlp": 0.01084191, + "balance_loss_clip": 1.02733159, + "balance_loss_mlp": 1.00389171, + "epoch": 0.5373654782660975, + "flos": 15377021642880.0, + "grad_norm": 2.5306772569543683, + "language_loss": 0.84604532, + "learning_rate": 1.8552379741739873e-06, + "loss": 0.86819553, + "num_input_tokens_seen": 96491335, + "step": 4469, + "time_per_iteration": 2.686922311782837 + }, + { + "auxiliary_loss_clip": 0.01095765, + "auxiliary_loss_mlp": 0.00873107, + "balance_loss_clip": 1.02377939, + "balance_loss_mlp": 1.00186527, + "epoch": 0.5374857211567367, + "flos": 69000091574400.0, + "grad_norm": 0.8835977337029062, + "language_loss": 0.55693877, + "learning_rate": 1.8544610567265935e-06, + "loss": 0.57662749, + "num_input_tokens_seen": 96545275, + "step": 4470, + "time_per_iteration": 3.261486530303955 + }, + { + "auxiliary_loss_clip": 0.01120056, + "auxiliary_loss_mlp": 0.00873067, + "balance_loss_clip": 1.02802503, + "balance_loss_mlp": 1.00017786, + "epoch": 0.5376059640473757, + "flos": 15085355207040.0, + "grad_norm": 1.9901754650578158, + "language_loss": 0.82998013, + "learning_rate": 1.853684161357341e-06, + "loss": 0.84991133, + "num_input_tokens_seen": 96562935, + "step": 4471, + "time_per_iteration": 2.7229056358337402 + }, + { + "auxiliary_loss_clip": 0.01128404, + "auxiliary_loss_mlp": 0.0087317, + "balance_loss_clip": 1.02710032, + "balance_loss_mlp": 1.00016952, + "epoch": 0.5377262069380148, + "flos": 19792597570560.0, + "grad_norm": 1.658044182673152, + "language_loss": 0.76677299, + "learning_rate": 1.852907288184085e-06, + "loss": 0.7867887, + "num_input_tokens_seen": 96581820, + "step": 4472, + "time_per_iteration": 2.685915946960449 + }, + { + "auxiliary_loss_clip": 0.0109743, + "auxiliary_loss_mlp": 0.01084246, + "balance_loss_clip": 1.02155042, + "balance_loss_mlp": 1.00375605, + "epoch": 0.5378464498286539, + "flos": 30003077640960.0, + "grad_norm": 1.688833685131968, + "language_loss": 0.70023298, + "learning_rate": 1.8521304373246762e-06, + "loss": 0.72204977, + "num_input_tokens_seen": 96602865, + "step": 4473, + "time_per_iteration": 2.8104918003082275 + }, + { + "auxiliary_loss_clip": 0.01131836, + "auxiliary_loss_mlp": 0.0108701, + "balance_loss_clip": 1.02796113, + "balance_loss_mlp": 1.00637627, + "epoch": 0.537966692719293, + "flos": 21251217058560.0, + "grad_norm": 2.1648594533521726, + "language_loss": 0.88295883, + "learning_rate": 1.8513536088969626e-06, + "loss": 0.90514725, + "num_input_tokens_seen": 96620530, + "step": 4474, + "time_per_iteration": 2.6801576614379883 + }, + { + "auxiliary_loss_clip": 0.01130712, + "auxiliary_loss_mlp": 0.01085538, + "balance_loss_clip": 1.02847338, + "balance_loss_mlp": 1.00500059, + "epoch": 0.538086935609932, + "flos": 21543170803200.0, + "grad_norm": 1.5688937377745025, + "language_loss": 0.80364835, + "learning_rate": 1.8505768030187884e-06, + "loss": 0.82581079, + "num_input_tokens_seen": 96640660, + "step": 4475, + "time_per_iteration": 2.641091823577881 + }, + { + "auxiliary_loss_clip": 0.01119756, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_clip": 1.02805793, + "balance_loss_mlp": 1.00383162, + "epoch": 0.5382071785005712, + "flos": 22747219626240.0, + "grad_norm": 1.5731896516749124, + "language_loss": 0.80155081, + "learning_rate": 1.849800019807995e-06, + "loss": 0.82359064, + "num_input_tokens_seen": 96661885, + "step": 4476, + "time_per_iteration": 2.7452392578125 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.01084572, + "balance_loss_clip": 1.02382731, + "balance_loss_mlp": 1.00412965, + "epoch": 0.5383274213912103, + "flos": 24934574240640.0, + "grad_norm": 1.9027578928992666, + "language_loss": 0.71009737, + "learning_rate": 1.8490232593824186e-06, + "loss": 0.73196822, + "num_input_tokens_seen": 96678340, + "step": 4477, + "time_per_iteration": 2.765552043914795 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_clip": 1.02635622, + "balance_loss_mlp": 1.00399613, + "epoch": 0.5384476642818493, + "flos": 22310186849280.0, + "grad_norm": 1.5497382185730146, + "language_loss": 0.84803641, + "learning_rate": 1.8482465218598935e-06, + "loss": 0.87006199, + "num_input_tokens_seen": 96698285, + "step": 4478, + "time_per_iteration": 2.7101452350616455 + }, + { + "auxiliary_loss_clip": 0.01114239, + "auxiliary_loss_mlp": 0.01084902, + "balance_loss_clip": 1.02733564, + "balance_loss_mlp": 1.00441194, + "epoch": 0.5385679071724885, + "flos": 22711021695360.0, + "grad_norm": 1.917723594880855, + "language_loss": 0.8330164, + "learning_rate": 1.8474698073582508e-06, + "loss": 0.85500789, + "num_input_tokens_seen": 96719655, + "step": 4479, + "time_per_iteration": 2.696549892425537 + }, + { + "auxiliary_loss_clip": 0.01111846, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_clip": 1.02654052, + "balance_loss_mlp": 1.00477135, + "epoch": 0.5386881500631275, + "flos": 15953746412160.0, + "grad_norm": 1.9512053572724188, + "language_loss": 0.87134385, + "learning_rate": 1.8466931159953166e-06, + "loss": 0.89331543, + "num_input_tokens_seen": 96736290, + "step": 4480, + "time_per_iteration": 2.7844810485839844 + }, + { + "auxiliary_loss_clip": 0.01122863, + "auxiliary_loss_mlp": 0.01085834, + "balance_loss_clip": 1.02889204, + "balance_loss_mlp": 1.00539184, + "epoch": 0.5388083929537666, + "flos": 24060041809920.0, + "grad_norm": 1.6326935850512925, + "language_loss": 0.83913374, + "learning_rate": 1.8459164478889158e-06, + "loss": 0.86122072, + "num_input_tokens_seen": 96757685, + "step": 4481, + "time_per_iteration": 2.775245189666748 + }, + { + "auxiliary_loss_clip": 0.01112542, + "auxiliary_loss_mlp": 0.01086231, + "balance_loss_clip": 1.02634716, + "balance_loss_mlp": 1.00578845, + "epoch": 0.5389286358444056, + "flos": 22236893147520.0, + "grad_norm": 1.5889054709962995, + "language_loss": 0.76220489, + "learning_rate": 1.8451398031568663e-06, + "loss": 0.78419256, + "num_input_tokens_seen": 96777310, + "step": 4482, + "time_per_iteration": 2.687385320663452 + }, + { + "auxiliary_loss_clip": 0.01111426, + "auxiliary_loss_mlp": 0.01086361, + "balance_loss_clip": 1.02679873, + "balance_loss_mlp": 1.00577545, + "epoch": 0.5390488787350448, + "flos": 24281718595200.0, + "grad_norm": 1.6150029678584945, + "language_loss": 0.74608803, + "learning_rate": 1.844363181916986e-06, + "loss": 0.76806593, + "num_input_tokens_seen": 96798035, + "step": 4483, + "time_per_iteration": 2.8646068572998047 + }, + { + "auxiliary_loss_clip": 0.01132659, + "auxiliary_loss_mlp": 0.01085517, + "balance_loss_clip": 1.02867556, + "balance_loss_mlp": 1.0049789, + "epoch": 0.5391691216256839, + "flos": 16581393688320.0, + "grad_norm": 2.5612724132916873, + "language_loss": 0.83079553, + "learning_rate": 1.8435865842870868e-06, + "loss": 0.85297728, + "num_input_tokens_seen": 96815975, + "step": 4484, + "time_per_iteration": 2.636929512023926 + }, + { + "auxiliary_loss_clip": 0.01123066, + "auxiliary_loss_mlp": 0.00873143, + "balance_loss_clip": 1.0272963, + "balance_loss_mlp": 1.00016928, + "epoch": 0.5392893645163229, + "flos": 23330049707520.0, + "grad_norm": 1.8390045481161454, + "language_loss": 0.7207855, + "learning_rate": 1.8428100103849787e-06, + "loss": 0.74074757, + "num_input_tokens_seen": 96835770, + "step": 4485, + "time_per_iteration": 2.7986106872558594 + }, + { + "auxiliary_loss_clip": 0.01118734, + "auxiliary_loss_mlp": 0.01085215, + "balance_loss_clip": 1.02739191, + "balance_loss_mlp": 1.0047251, + "epoch": 0.5394096074069621, + "flos": 15669801400320.0, + "grad_norm": 1.9952830071744718, + "language_loss": 0.73723722, + "learning_rate": 1.842033460328467e-06, + "loss": 0.75927669, + "num_input_tokens_seen": 96854490, + "step": 4486, + "time_per_iteration": 2.686936855316162 + }, + { + "auxiliary_loss_clip": 0.01123732, + "auxiliary_loss_mlp": 0.00873067, + "balance_loss_clip": 1.02861559, + "balance_loss_mlp": 1.00019383, + "epoch": 0.5395298502976011, + "flos": 22893447893760.0, + "grad_norm": 1.7004265763984876, + "language_loss": 0.75380993, + "learning_rate": 1.8412569342353541e-06, + "loss": 0.77377796, + "num_input_tokens_seen": 96874645, + "step": 4487, + "time_per_iteration": 2.810741901397705 + }, + { + "auxiliary_loss_clip": 0.01120281, + "auxiliary_loss_mlp": 0.01084899, + "balance_loss_clip": 1.02644598, + "balance_loss_mlp": 1.00436163, + "epoch": 0.5396500931882402, + "flos": 23842135952640.0, + "grad_norm": 1.9324744590650422, + "language_loss": 0.8457036, + "learning_rate": 1.840480432223438e-06, + "loss": 0.86775541, + "num_input_tokens_seen": 96893650, + "step": 4488, + "time_per_iteration": 4.4862282276153564 + }, + { + "auxiliary_loss_clip": 0.0112267, + "auxiliary_loss_mlp": 0.01085143, + "balance_loss_clip": 1.02794635, + "balance_loss_mlp": 1.00474846, + "epoch": 0.5397703360788794, + "flos": 26322988596480.0, + "grad_norm": 2.4478329265095247, + "language_loss": 0.77783138, + "learning_rate": 1.8397039544105131e-06, + "loss": 0.79990953, + "num_input_tokens_seen": 96912735, + "step": 4489, + "time_per_iteration": 2.7313249111175537 + }, + { + "auxiliary_loss_clip": 0.01122274, + "auxiliary_loss_mlp": 0.01084772, + "balance_loss_clip": 1.02662194, + "balance_loss_mlp": 1.004282, + "epoch": 0.5398905789695184, + "flos": 21214588164480.0, + "grad_norm": 1.5959295263568198, + "language_loss": 0.6959703, + "learning_rate": 1.8389275009143711e-06, + "loss": 0.71804082, + "num_input_tokens_seen": 96932475, + "step": 4490, + "time_per_iteration": 2.8267271518707275 + }, + { + "auxiliary_loss_clip": 0.01140485, + "auxiliary_loss_mlp": 0.01085061, + "balance_loss_clip": 1.02907777, + "balance_loss_mlp": 1.00461888, + "epoch": 0.5400108218601575, + "flos": 25080335631360.0, + "grad_norm": 1.6376435194795351, + "language_loss": 0.73662126, + "learning_rate": 1.8381510718527988e-06, + "loss": 0.7588768, + "num_input_tokens_seen": 96952085, + "step": 4491, + "time_per_iteration": 2.6604268550872803 + }, + { + "auxiliary_loss_clip": 0.01122453, + "auxiliary_loss_mlp": 0.01086382, + "balance_loss_clip": 1.02730107, + "balance_loss_mlp": 1.00589192, + "epoch": 0.5401310647507966, + "flos": 26357498588160.0, + "grad_norm": 1.9633515911538784, + "language_loss": 0.63407779, + "learning_rate": 1.8373746673435812e-06, + "loss": 0.65616614, + "num_input_tokens_seen": 96973110, + "step": 4492, + "time_per_iteration": 3.8232429027557373 + }, + { + "auxiliary_loss_clip": 0.01139914, + "auxiliary_loss_mlp": 0.01085167, + "balance_loss_clip": 1.0290029, + "balance_loss_mlp": 1.00462914, + "epoch": 0.5402513076414357, + "flos": 27855332749440.0, + "grad_norm": 1.6071997266777953, + "language_loss": 0.7891562, + "learning_rate": 1.8365982875044964e-06, + "loss": 0.81140709, + "num_input_tokens_seen": 96993420, + "step": 4493, + "time_per_iteration": 3.6194560527801514 + }, + { + "auxiliary_loss_clip": 0.01131335, + "auxiliary_loss_mlp": 0.00873246, + "balance_loss_clip": 1.02837527, + "balance_loss_mlp": 1.00014949, + "epoch": 0.5403715505320748, + "flos": 22893771116160.0, + "grad_norm": 2.004215772103677, + "language_loss": 0.75904489, + "learning_rate": 1.8358219324533217e-06, + "loss": 0.77909076, + "num_input_tokens_seen": 97013685, + "step": 4494, + "time_per_iteration": 2.664267063140869 + }, + { + "auxiliary_loss_clip": 0.01120713, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_clip": 1.02728105, + "balance_loss_mlp": 1.00400555, + "epoch": 0.5404917934227139, + "flos": 30224143895040.0, + "grad_norm": 1.5126351081963663, + "language_loss": 0.70380437, + "learning_rate": 1.8350456023078292e-06, + "loss": 0.72585404, + "num_input_tokens_seen": 97036060, + "step": 4495, + "time_per_iteration": 2.7740542888641357 + }, + { + "auxiliary_loss_clip": 0.01141131, + "auxiliary_loss_mlp": 0.01084841, + "balance_loss_clip": 1.02870739, + "balance_loss_mlp": 1.00435114, + "epoch": 0.540612036313353, + "flos": 19938502615680.0, + "grad_norm": 2.2067708238197574, + "language_loss": 0.77841151, + "learning_rate": 1.8342692971857874e-06, + "loss": 0.80067122, + "num_input_tokens_seen": 97055260, + "step": 4496, + "time_per_iteration": 2.618220806121826 + }, + { + "auxiliary_loss_clip": 0.01116981, + "auxiliary_loss_mlp": 0.01085167, + "balance_loss_clip": 1.02507114, + "balance_loss_mlp": 1.00472426, + "epoch": 0.540732279203992, + "flos": 24279599692800.0, + "grad_norm": 2.0846014108392836, + "language_loss": 0.71001184, + "learning_rate": 1.833493017204962e-06, + "loss": 0.73203331, + "num_input_tokens_seen": 97075365, + "step": 4497, + "time_per_iteration": 2.7150182723999023 + }, + { + "auxiliary_loss_clip": 0.01140658, + "auxiliary_loss_mlp": 0.01084557, + "balance_loss_clip": 1.0293448, + "balance_loss_mlp": 1.00397146, + "epoch": 0.5408525220946312, + "flos": 20193216935040.0, + "grad_norm": 2.960547475934261, + "language_loss": 0.77762997, + "learning_rate": 1.8327167624831134e-06, + "loss": 0.79988217, + "num_input_tokens_seen": 97093095, + "step": 4498, + "time_per_iteration": 2.620100259780884 + }, + { + "auxiliary_loss_clip": 0.01140172, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.0291307, + "balance_loss_mlp": 1.0047071, + "epoch": 0.5409727649852702, + "flos": 24134448833280.0, + "grad_norm": 1.6527581384012997, + "language_loss": 0.70810318, + "learning_rate": 1.831940533137999e-06, + "loss": 0.7303564, + "num_input_tokens_seen": 97112000, + "step": 4499, + "time_per_iteration": 2.649808883666992 + }, + { + "auxiliary_loss_clip": 0.0112401, + "auxiliary_loss_mlp": 0.01086242, + "balance_loss_clip": 1.02361131, + "balance_loss_mlp": 1.00584722, + "epoch": 0.5410930078759093, + "flos": 23912700220800.0, + "grad_norm": 1.7073117751251228, + "language_loss": 0.72257173, + "learning_rate": 1.8311643292873718e-06, + "loss": 0.74467432, + "num_input_tokens_seen": 97130820, + "step": 4500, + "time_per_iteration": 2.7346489429473877 + }, + { + "auxiliary_loss_clip": 0.01128364, + "auxiliary_loss_mlp": 0.01086166, + "balance_loss_clip": 1.02721167, + "balance_loss_mlp": 1.0057714, + "epoch": 0.5412132507665485, + "flos": 21105132445440.0, + "grad_norm": 1.7552565673943812, + "language_loss": 0.88086683, + "learning_rate": 1.8303881510489818e-06, + "loss": 0.9030121, + "num_input_tokens_seen": 97149210, + "step": 4501, + "time_per_iteration": 2.7675437927246094 + }, + { + "auxiliary_loss_clip": 0.01118448, + "auxiliary_loss_mlp": 0.01085513, + "balance_loss_clip": 1.02548099, + "balance_loss_mlp": 1.00497532, + "epoch": 0.5413334936571875, + "flos": 30227340205440.0, + "grad_norm": 1.8087626875367306, + "language_loss": 0.69012797, + "learning_rate": 1.829611998540574e-06, + "loss": 0.71216762, + "num_input_tokens_seen": 97170415, + "step": 4502, + "time_per_iteration": 2.795602560043335 + }, + { + "auxiliary_loss_clip": 0.01128012, + "auxiliary_loss_mlp": 0.00873152, + "balance_loss_clip": 1.02638125, + "balance_loss_mlp": 1.00022507, + "epoch": 0.5414537365478266, + "flos": 24279635606400.0, + "grad_norm": 3.0958202379080064, + "language_loss": 0.79571199, + "learning_rate": 1.8288358718798914e-06, + "loss": 0.81572366, + "num_input_tokens_seen": 97189605, + "step": 4503, + "time_per_iteration": 2.6338772773742676 + }, + { + "auxiliary_loss_clip": 0.01122965, + "auxiliary_loss_mlp": 0.00873113, + "balance_loss_clip": 1.0270505, + "balance_loss_mlp": 1.0002538, + "epoch": 0.5415739794384657, + "flos": 16654543735680.0, + "grad_norm": 1.6324111643579506, + "language_loss": 0.72470587, + "learning_rate": 1.8280597711846703e-06, + "loss": 0.7446667, + "num_input_tokens_seen": 97207845, + "step": 4504, + "time_per_iteration": 2.6501517295837402 + }, + { + "auxiliary_loss_clip": 0.01124395, + "auxiliary_loss_mlp": 0.0108443, + "balance_loss_clip": 1.02411723, + "balance_loss_mlp": 1.00408292, + "epoch": 0.5416942223291048, + "flos": 23185724860800.0, + "grad_norm": 2.1174165838044097, + "language_loss": 0.83469427, + "learning_rate": 1.8272836965726455e-06, + "loss": 0.8567825, + "num_input_tokens_seen": 97226780, + "step": 4505, + "time_per_iteration": 2.655355215072632 + }, + { + "auxiliary_loss_clip": 0.01095247, + "auxiliary_loss_mlp": 0.01085036, + "balance_loss_clip": 1.02565479, + "balance_loss_mlp": 1.00454545, + "epoch": 0.5418144652197439, + "flos": 20303247271680.0, + "grad_norm": 1.7595971118642435, + "language_loss": 0.78050196, + "learning_rate": 1.8265076481615461e-06, + "loss": 0.80230474, + "num_input_tokens_seen": 97246695, + "step": 4506, + "time_per_iteration": 2.914834499359131 + }, + { + "auxiliary_loss_clip": 0.01121993, + "auxiliary_loss_mlp": 0.01085672, + "balance_loss_clip": 1.02884531, + "balance_loss_mlp": 1.00508606, + "epoch": 0.541934708110383, + "flos": 12458633431680.0, + "grad_norm": 2.235130926582984, + "language_loss": 0.87324589, + "learning_rate": 1.8257316260690987e-06, + "loss": 0.89532256, + "num_input_tokens_seen": 97264480, + "step": 4507, + "time_per_iteration": 2.675929307937622 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_clip": 1.02836752, + "balance_loss_mlp": 1.00404978, + "epoch": 0.5420549510010221, + "flos": 21253802837760.0, + "grad_norm": 1.5167800435675765, + "language_loss": 0.75940573, + "learning_rate": 1.8249556304130254e-06, + "loss": 0.78139865, + "num_input_tokens_seen": 97285760, + "step": 4508, + "time_per_iteration": 2.6872732639312744 + }, + { + "auxiliary_loss_clip": 0.01121239, + "auxiliary_loss_mlp": 0.01085859, + "balance_loss_clip": 1.02680588, + "balance_loss_mlp": 1.00551176, + "epoch": 0.5421751938916611, + "flos": 29490524519040.0, + "grad_norm": 2.105087295676966, + "language_loss": 0.68822742, + "learning_rate": 1.824179661311044e-06, + "loss": 0.71029836, + "num_input_tokens_seen": 97304510, + "step": 4509, + "time_per_iteration": 2.7565407752990723 + }, + { + "auxiliary_loss_clip": 0.01102737, + "auxiliary_loss_mlp": 0.01085465, + "balance_loss_clip": 1.02571678, + "balance_loss_mlp": 1.005023, + "epoch": 0.5422954367823003, + "flos": 18734238311040.0, + "grad_norm": 1.8186661286168007, + "language_loss": 0.80086619, + "learning_rate": 1.823403718880868e-06, + "loss": 0.82274824, + "num_input_tokens_seen": 97323270, + "step": 4510, + "time_per_iteration": 2.780787944793701 + }, + { + "auxiliary_loss_clip": 0.01124141, + "auxiliary_loss_mlp": 0.01084473, + "balance_loss_clip": 1.02814436, + "balance_loss_mlp": 1.00407791, + "epoch": 0.5424156796729394, + "flos": 39969006940800.0, + "grad_norm": 1.5872699514031785, + "language_loss": 0.66674447, + "learning_rate": 1.822627803240207e-06, + "loss": 0.68883067, + "num_input_tokens_seen": 97345600, + "step": 4511, + "time_per_iteration": 2.877390146255493 + }, + { + "auxiliary_loss_clip": 0.0111135, + "auxiliary_loss_mlp": 0.01085357, + "balance_loss_clip": 1.02558482, + "balance_loss_mlp": 1.00491428, + "epoch": 0.5425359225635784, + "flos": 11546538353280.0, + "grad_norm": 2.453439123904123, + "language_loss": 0.85157716, + "learning_rate": 1.8218519145067675e-06, + "loss": 0.87354428, + "num_input_tokens_seen": 97361220, + "step": 4512, + "time_per_iteration": 2.7040460109710693 + }, + { + "auxiliary_loss_clip": 0.01105954, + "auxiliary_loss_mlp": 0.01085885, + "balance_loss_clip": 1.02563858, + "balance_loss_mlp": 1.00534761, + "epoch": 0.5426561654542175, + "flos": 20229702174720.0, + "grad_norm": 1.7399488160315828, + "language_loss": 0.89447743, + "learning_rate": 1.8210760527982508e-06, + "loss": 0.91639584, + "num_input_tokens_seen": 97381505, + "step": 4513, + "time_per_iteration": 2.8387627601623535 + }, + { + "auxiliary_loss_clip": 0.01122791, + "auxiliary_loss_mlp": 0.00873064, + "balance_loss_clip": 1.02935386, + "balance_loss_mlp": 1.00020158, + "epoch": 0.5427764083448566, + "flos": 21871681614720.0, + "grad_norm": 1.9562292981788212, + "language_loss": 0.74809456, + "learning_rate": 1.8203002182323552e-06, + "loss": 0.76805317, + "num_input_tokens_seen": 97399060, + "step": 4514, + "time_per_iteration": 3.6489269733428955 + }, + { + "auxiliary_loss_clip": 0.01115801, + "auxiliary_loss_mlp": 0.01085239, + "balance_loss_clip": 1.02310157, + "balance_loss_mlp": 1.00489187, + "epoch": 0.5428966512354957, + "flos": 19640946349440.0, + "grad_norm": 1.821453198684949, + "language_loss": 0.75477648, + "learning_rate": 1.819524410926773e-06, + "loss": 0.77678692, + "num_input_tokens_seen": 97416740, + "step": 4515, + "time_per_iteration": 2.718773365020752 + }, + { + "auxiliary_loss_clip": 0.01083388, + "auxiliary_loss_mlp": 0.01085491, + "balance_loss_clip": 1.02293539, + "balance_loss_mlp": 1.00504887, + "epoch": 0.5430168941261347, + "flos": 22382187661440.0, + "grad_norm": 1.5197559639405576, + "language_loss": 0.77132195, + "learning_rate": 1.8187486309991944e-06, + "loss": 0.79301077, + "num_input_tokens_seen": 97437620, + "step": 4516, + "time_per_iteration": 2.8829407691955566 + }, + { + "auxiliary_loss_clip": 0.0111573, + "auxiliary_loss_mlp": 0.01085464, + "balance_loss_clip": 1.02821589, + "balance_loss_mlp": 1.00516415, + "epoch": 0.5431371370167739, + "flos": 18764187275520.0, + "grad_norm": 1.574018574691992, + "language_loss": 0.77271402, + "learning_rate": 1.817972878567304e-06, + "loss": 0.79472589, + "num_input_tokens_seen": 97456275, + "step": 4517, + "time_per_iteration": 3.621542453765869 + }, + { + "auxiliary_loss_clip": 0.01122046, + "auxiliary_loss_mlp": 0.01084871, + "balance_loss_clip": 1.02718318, + "balance_loss_mlp": 1.00442839, + "epoch": 0.543257379907413, + "flos": 18806023641600.0, + "grad_norm": 1.7060760986954115, + "language_loss": 0.76127529, + "learning_rate": 1.8171971537487834e-06, + "loss": 0.78334439, + "num_input_tokens_seen": 97474925, + "step": 4518, + "time_per_iteration": 2.7399260997772217 + }, + { + "auxiliary_loss_clip": 0.01139627, + "auxiliary_loss_mlp": 0.01084761, + "balance_loss_clip": 1.02788842, + "balance_loss_mlp": 1.00431824, + "epoch": 0.543377622798052, + "flos": 17493381025920.0, + "grad_norm": 1.813307471036553, + "language_loss": 0.80369335, + "learning_rate": 1.8164214566613093e-06, + "loss": 0.82593727, + "num_input_tokens_seen": 97493550, + "step": 4519, + "time_per_iteration": 3.5767791271209717 + }, + { + "auxiliary_loss_clip": 0.01139647, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.02857637, + "balance_loss_mlp": 1.00360417, + "epoch": 0.5434978656886912, + "flos": 18989311766400.0, + "grad_norm": 2.8515432470599404, + "language_loss": 0.65771985, + "learning_rate": 1.8156457874225547e-06, + "loss": 0.67995679, + "num_input_tokens_seen": 97512010, + "step": 4520, + "time_per_iteration": 2.6003353595733643 + }, + { + "auxiliary_loss_clip": 0.01112263, + "auxiliary_loss_mlp": 0.01085071, + "balance_loss_clip": 1.0259012, + "balance_loss_mlp": 1.00462818, + "epoch": 0.5436181085793302, + "flos": 17274936464640.0, + "grad_norm": 1.6985248007426899, + "language_loss": 0.8117069, + "learning_rate": 1.814870146150187e-06, + "loss": 0.83368027, + "num_input_tokens_seen": 97530120, + "step": 4521, + "time_per_iteration": 2.6986083984375 + }, + { + "auxiliary_loss_clip": 0.01107878, + "auxiliary_loss_mlp": 0.01084987, + "balance_loss_clip": 1.02868259, + "balance_loss_mlp": 1.00459266, + "epoch": 0.5437383514699693, + "flos": 19098587917440.0, + "grad_norm": 1.9052557414985172, + "language_loss": 0.78907913, + "learning_rate": 1.814094532961871e-06, + "loss": 0.81100774, + "num_input_tokens_seen": 97548695, + "step": 4522, + "time_per_iteration": 2.616985321044922 + }, + { + "auxiliary_loss_clip": 0.01103246, + "auxiliary_loss_mlp": 0.01085027, + "balance_loss_clip": 1.02508926, + "balance_loss_mlp": 1.00458431, + "epoch": 0.5438585943606085, + "flos": 22602715211520.0, + "grad_norm": 2.2081419570069927, + "language_loss": 0.83680612, + "learning_rate": 1.8133189479752666e-06, + "loss": 0.85868883, + "num_input_tokens_seen": 97567625, + "step": 4523, + "time_per_iteration": 2.795588493347168 + }, + { + "auxiliary_loss_clip": 0.01140863, + "auxiliary_loss_mlp": 0.01084698, + "balance_loss_clip": 1.02954364, + "balance_loss_mlp": 1.00430286, + "epoch": 0.5439788372512475, + "flos": 21798495653760.0, + "grad_norm": 1.9368792100108112, + "language_loss": 0.81597495, + "learning_rate": 1.8125433913080292e-06, + "loss": 0.83823061, + "num_input_tokens_seen": 97585325, + "step": 4524, + "time_per_iteration": 2.663801431655884 + }, + { + "auxiliary_loss_clip": 0.01052719, + "auxiliary_loss_mlp": 0.01084401, + "balance_loss_clip": 1.01845706, + "balance_loss_mlp": 1.0041492, + "epoch": 0.5440990801418866, + "flos": 16399362539520.0, + "grad_norm": 25.756142055566283, + "language_loss": 0.82429683, + "learning_rate": 1.811767863077811e-06, + "loss": 0.84566802, + "num_input_tokens_seen": 97604275, + "step": 4525, + "time_per_iteration": 3.0841922760009766 + }, + { + "auxiliary_loss_clip": 0.01082003, + "auxiliary_loss_mlp": 0.0108406, + "balance_loss_clip": 1.02266598, + "balance_loss_mlp": 1.00371253, + "epoch": 0.5442193230325257, + "flos": 21615638492160.0, + "grad_norm": 1.7046835766605866, + "language_loss": 0.77995336, + "learning_rate": 1.8109923634022577e-06, + "loss": 0.80161405, + "num_input_tokens_seen": 97624300, + "step": 4526, + "time_per_iteration": 3.044506311416626 + }, + { + "auxiliary_loss_clip": 0.01140419, + "auxiliary_loss_mlp": 0.01084992, + "balance_loss_clip": 1.0283407, + "balance_loss_mlp": 1.0045501, + "epoch": 0.5443395659231648, + "flos": 15481198062720.0, + "grad_norm": 2.0770775440463347, + "language_loss": 0.86673856, + "learning_rate": 1.8102168923990128e-06, + "loss": 0.88899267, + "num_input_tokens_seen": 97637845, + "step": 4527, + "time_per_iteration": 2.610095977783203 + }, + { + "auxiliary_loss_clip": 0.0113114, + "auxiliary_loss_mlp": 0.00872966, + "balance_loss_clip": 1.02846622, + "balance_loss_mlp": 1.00027061, + "epoch": 0.5444598088138038, + "flos": 18770436241920.0, + "grad_norm": 2.0511844795553715, + "language_loss": 0.8020699, + "learning_rate": 1.809441450185714e-06, + "loss": 0.82211101, + "num_input_tokens_seen": 97656330, + "step": 4528, + "time_per_iteration": 2.7148547172546387 + }, + { + "auxiliary_loss_clip": 0.01122033, + "auxiliary_loss_mlp": 0.01084344, + "balance_loss_clip": 1.02666855, + "balance_loss_mlp": 1.00390184, + "epoch": 0.544580051704443, + "flos": 21142335957120.0, + "grad_norm": 2.4002722954550415, + "language_loss": 0.7300604, + "learning_rate": 1.8086660368799958e-06, + "loss": 0.75212419, + "num_input_tokens_seen": 97674380, + "step": 4529, + "time_per_iteration": 2.7008092403411865 + }, + { + "auxiliary_loss_clip": 0.01119615, + "auxiliary_loss_mlp": 0.01085243, + "balance_loss_clip": 1.02620029, + "balance_loss_mlp": 1.00465703, + "epoch": 0.5447002945950821, + "flos": 32491508054400.0, + "grad_norm": 1.6045051201754739, + "language_loss": 0.77506369, + "learning_rate": 1.807890652599488e-06, + "loss": 0.79711223, + "num_input_tokens_seen": 97698765, + "step": 4530, + "time_per_iteration": 2.864271640777588 + }, + { + "auxiliary_loss_clip": 0.01139623, + "auxiliary_loss_mlp": 0.01085914, + "balance_loss_clip": 1.0285151, + "balance_loss_mlp": 1.0055666, + "epoch": 0.5448205374857211, + "flos": 11798307757440.0, + "grad_norm": 1.921411286681614, + "language_loss": 0.82658148, + "learning_rate": 1.8071152974618156e-06, + "loss": 0.84883684, + "num_input_tokens_seen": 97716565, + "step": 4531, + "time_per_iteration": 2.5932226181030273 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.00873093, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 1.00020647, + "epoch": 0.5449407803763603, + "flos": 24133766474880.0, + "grad_norm": 2.126027007342578, + "language_loss": 0.78776002, + "learning_rate": 1.806339971584599e-06, + "loss": 0.80760175, + "num_input_tokens_seen": 97733225, + "step": 4532, + "time_per_iteration": 2.7647292613983154 + }, + { + "auxiliary_loss_clip": 0.01139353, + "auxiliary_loss_mlp": 0.01084754, + "balance_loss_clip": 1.02809596, + "balance_loss_mlp": 1.00431132, + "epoch": 0.5450610232669993, + "flos": 23258551685760.0, + "grad_norm": 1.6299165803839983, + "language_loss": 0.85268861, + "learning_rate": 1.8055646750854546e-06, + "loss": 0.87492967, + "num_input_tokens_seen": 97752735, + "step": 4533, + "time_per_iteration": 2.611274242401123 + }, + { + "auxiliary_loss_clip": 0.01121502, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_clip": 1.02729702, + "balance_loss_mlp": 1.00571215, + "epoch": 0.5451812661576384, + "flos": 17785083375360.0, + "grad_norm": 2.5928544753881075, + "language_loss": 0.81476432, + "learning_rate": 1.8047894080819945e-06, + "loss": 0.83684039, + "num_input_tokens_seen": 97769985, + "step": 4534, + "time_per_iteration": 2.7127437591552734 + }, + { + "auxiliary_loss_clip": 0.01119942, + "auxiliary_loss_mlp": 0.01079188, + "balance_loss_clip": 1.02362156, + "balance_loss_mlp": 1.00022364, + "epoch": 0.5453015090482776, + "flos": 71062586513280.0, + "grad_norm": 0.7307581358473919, + "language_loss": 0.6327346, + "learning_rate": 1.8040141706918258e-06, + "loss": 0.65472597, + "num_input_tokens_seen": 97831225, + "step": 4535, + "time_per_iteration": 3.3344130516052246 + }, + { + "auxiliary_loss_clip": 0.0110644, + "auxiliary_loss_mlp": 0.01085144, + "balance_loss_clip": 1.02822924, + "balance_loss_mlp": 1.00470161, + "epoch": 0.5454217519389166, + "flos": 25552201622400.0, + "grad_norm": 1.626882504229512, + "language_loss": 0.76998198, + "learning_rate": 1.8032389630325525e-06, + "loss": 0.79189783, + "num_input_tokens_seen": 97849975, + "step": 4536, + "time_per_iteration": 2.741544246673584 + }, + { + "auxiliary_loss_clip": 0.01122049, + "auxiliary_loss_mlp": 0.01085077, + "balance_loss_clip": 1.02642739, + "balance_loss_mlp": 1.00468254, + "epoch": 0.5455419948295557, + "flos": 23658345037440.0, + "grad_norm": 1.6665112435968619, + "language_loss": 0.75574255, + "learning_rate": 1.8024637852217707e-06, + "loss": 0.77781385, + "num_input_tokens_seen": 97869700, + "step": 4537, + "time_per_iteration": 2.7442538738250732 + }, + { + "auxiliary_loss_clip": 0.01120307, + "auxiliary_loss_mlp": 0.01085698, + "balance_loss_clip": 1.02651691, + "balance_loss_mlp": 1.00539863, + "epoch": 0.5456622377201948, + "flos": 23403989854080.0, + "grad_norm": 1.6710729744889599, + "language_loss": 0.84831417, + "learning_rate": 1.8016886373770766e-06, + "loss": 0.8703742, + "num_input_tokens_seen": 97888215, + "step": 4538, + "time_per_iteration": 2.6701059341430664 + }, + { + "auxiliary_loss_clip": 0.01125314, + "auxiliary_loss_mlp": 0.01085556, + "balance_loss_clip": 1.02977896, + "balance_loss_mlp": 1.00516081, + "epoch": 0.5457824806108339, + "flos": 23988040997760.0, + "grad_norm": 2.1853080787528643, + "language_loss": 0.79002845, + "learning_rate": 1.8009135196160579e-06, + "loss": 0.81213719, + "num_input_tokens_seen": 97907090, + "step": 4539, + "time_per_iteration": 4.712982177734375 + }, + { + "auxiliary_loss_clip": 0.0111303, + "auxiliary_loss_mlp": 0.01086053, + "balance_loss_clip": 1.02734864, + "balance_loss_mlp": 1.00561047, + "epoch": 0.545902723501473, + "flos": 22565870835840.0, + "grad_norm": 1.620091541170353, + "language_loss": 0.84184468, + "learning_rate": 1.8001384320563e-06, + "loss": 0.86383551, + "num_input_tokens_seen": 97927345, + "step": 4540, + "time_per_iteration": 2.799485445022583 + }, + { + "auxiliary_loss_clip": 0.01120375, + "auxiliary_loss_mlp": 0.01079258, + "balance_loss_clip": 1.02401161, + "balance_loss_mlp": 1.00029397, + "epoch": 0.5460229663921121, + "flos": 55198399685760.0, + "grad_norm": 0.7869305467810521, + "language_loss": 0.57749116, + "learning_rate": 1.7993633748153833e-06, + "loss": 0.59948748, + "num_input_tokens_seen": 97981950, + "step": 4541, + "time_per_iteration": 3.0746192932128906 + }, + { + "auxiliary_loss_clip": 0.01132954, + "auxiliary_loss_mlp": 0.01085605, + "balance_loss_clip": 1.0291636, + "balance_loss_mlp": 1.00516224, + "epoch": 0.5461432092827512, + "flos": 15413866018560.0, + "grad_norm": 1.7860124628354512, + "language_loss": 0.7309655, + "learning_rate": 1.7985883480108834e-06, + "loss": 0.75315112, + "num_input_tokens_seen": 97999585, + "step": 4542, + "time_per_iteration": 2.684311628341675 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01084128, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.00363803, + "epoch": 0.5462634521733902, + "flos": 24024921287040.0, + "grad_norm": 1.8365610753093005, + "language_loss": 0.72124469, + "learning_rate": 1.797813351760371e-06, + "loss": 0.74342, + "num_input_tokens_seen": 98021290, + "step": 4543, + "time_per_iteration": 3.874802589416504 + }, + { + "auxiliary_loss_clip": 0.01139181, + "auxiliary_loss_mlp": 0.01085638, + "balance_loss_clip": 1.0276866, + "balance_loss_mlp": 1.00524306, + "epoch": 0.5463836950640293, + "flos": 22820944291200.0, + "grad_norm": 1.6766148280731594, + "language_loss": 0.78385735, + "learning_rate": 1.7970383861814116e-06, + "loss": 0.80610561, + "num_input_tokens_seen": 98041060, + "step": 4544, + "time_per_iteration": 2.6791188716888428 + }, + { + "auxiliary_loss_clip": 0.01132642, + "auxiliary_loss_mlp": 0.01084299, + "balance_loss_clip": 1.03016043, + "balance_loss_mlp": 1.00380898, + "epoch": 0.5465039379546685, + "flos": 20448290390400.0, + "grad_norm": 1.9220176727868064, + "language_loss": 0.74094772, + "learning_rate": 1.7962634513915684e-06, + "loss": 0.76311713, + "num_input_tokens_seen": 98058410, + "step": 4545, + "time_per_iteration": 3.6211678981781006 + }, + { + "auxiliary_loss_clip": 0.01140915, + "auxiliary_loss_mlp": 0.01084803, + "balance_loss_clip": 1.0292393, + "balance_loss_mlp": 1.00436056, + "epoch": 0.5466241808453075, + "flos": 17343310003200.0, + "grad_norm": 1.7437861956532403, + "language_loss": 0.79508495, + "learning_rate": 1.7954885475083969e-06, + "loss": 0.81734216, + "num_input_tokens_seen": 98076080, + "step": 4546, + "time_per_iteration": 2.5673370361328125 + }, + { + "auxiliary_loss_clip": 0.0114163, + "auxiliary_loss_mlp": 0.01084635, + "balance_loss_clip": 1.02981329, + "balance_loss_mlp": 1.00419307, + "epoch": 0.5467444237359466, + "flos": 21617039122560.0, + "grad_norm": 2.055405155693313, + "language_loss": 0.72531474, + "learning_rate": 1.7947136746494513e-06, + "loss": 0.74757737, + "num_input_tokens_seen": 98096995, + "step": 4547, + "time_per_iteration": 2.654984474182129 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01084081, + "balance_loss_clip": 1.02897036, + "balance_loss_mlp": 1.0036391, + "epoch": 0.5468646666265857, + "flos": 24170467196160.0, + "grad_norm": 1.7847781536026126, + "language_loss": 0.8781867, + "learning_rate": 1.793938832932277e-06, + "loss": 0.90033919, + "num_input_tokens_seen": 98115105, + "step": 4548, + "time_per_iteration": 2.642691135406494 + }, + { + "auxiliary_loss_clip": 0.01139456, + "auxiliary_loss_mlp": 0.01083905, + "balance_loss_clip": 1.02788687, + "balance_loss_mlp": 1.00365329, + "epoch": 0.5469849095172248, + "flos": 27527001505920.0, + "grad_norm": 2.088249709431649, + "language_loss": 0.7003113, + "learning_rate": 1.7931640224744185e-06, + "loss": 0.72254491, + "num_input_tokens_seen": 98135655, + "step": 4549, + "time_per_iteration": 2.6546099185943604 + }, + { + "auxiliary_loss_clip": 0.01113595, + "auxiliary_loss_mlp": 0.01084008, + "balance_loss_clip": 1.02674794, + "balance_loss_mlp": 1.00366068, + "epoch": 0.5471051524078638, + "flos": 27964680727680.0, + "grad_norm": 1.4883830176311612, + "language_loss": 0.73532212, + "learning_rate": 1.7923892433934127e-06, + "loss": 0.75729817, + "num_input_tokens_seen": 98156730, + "step": 4550, + "time_per_iteration": 2.776852607727051 + }, + { + "auxiliary_loss_clip": 0.01122501, + "auxiliary_loss_mlp": 0.00873166, + "balance_loss_clip": 1.02885532, + "balance_loss_mlp": 1.00017214, + "epoch": 0.547225395298503, + "flos": 18150510389760.0, + "grad_norm": 2.101521474941864, + "language_loss": 0.79037488, + "learning_rate": 1.7916144958067939e-06, + "loss": 0.81033158, + "num_input_tokens_seen": 98174590, + "step": 4551, + "time_per_iteration": 2.7228593826293945 + }, + { + "auxiliary_loss_clip": 0.01130567, + "auxiliary_loss_mlp": 0.0108457, + "balance_loss_clip": 1.02733719, + "balance_loss_mlp": 1.00412726, + "epoch": 0.5473456381891421, + "flos": 21361498790400.0, + "grad_norm": 2.2417535210545076, + "language_loss": 0.79117346, + "learning_rate": 1.7908397798320905e-06, + "loss": 0.81332481, + "num_input_tokens_seen": 98194325, + "step": 4552, + "time_per_iteration": 2.6730453968048096 + }, + { + "auxiliary_loss_clip": 0.01134467, + "auxiliary_loss_mlp": 0.00873135, + "balance_loss_clip": 1.03044009, + "balance_loss_mlp": 1.00015795, + "epoch": 0.5474658810797811, + "flos": 19932145908480.0, + "grad_norm": 1.743273152333917, + "language_loss": 0.75111616, + "learning_rate": 1.7900650955868265e-06, + "loss": 0.77119219, + "num_input_tokens_seen": 98213970, + "step": 4553, + "time_per_iteration": 2.6987409591674805 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.00873117, + "balance_loss_clip": 1.02964139, + "balance_loss_mlp": 1.00029445, + "epoch": 0.5475861239704203, + "flos": 50476217264640.0, + "grad_norm": 1.359523050305319, + "language_loss": 0.76542073, + "learning_rate": 1.7892904431885202e-06, + "loss": 0.78546757, + "num_input_tokens_seen": 98241145, + "step": 4554, + "time_per_iteration": 3.051227569580078 + }, + { + "auxiliary_loss_clip": 0.01103019, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_clip": 1.02576709, + "balance_loss_mlp": 1.00481248, + "epoch": 0.5477063668610593, + "flos": 20705123612160.0, + "grad_norm": 1.654715777002488, + "language_loss": 0.7555989, + "learning_rate": 1.788515822754686e-06, + "loss": 0.77748024, + "num_input_tokens_seen": 98261565, + "step": 4555, + "time_per_iteration": 2.807544708251953 + }, + { + "auxiliary_loss_clip": 0.01117174, + "auxiliary_loss_mlp": 0.01084713, + "balance_loss_clip": 1.02931023, + "balance_loss_mlp": 1.00412726, + "epoch": 0.5478266097516984, + "flos": 19609740408960.0, + "grad_norm": 1.9649304432644867, + "language_loss": 0.78587437, + "learning_rate": 1.7877412344028335e-06, + "loss": 0.80789328, + "num_input_tokens_seen": 98281370, + "step": 4556, + "time_per_iteration": 2.8785202503204346 + }, + { + "auxiliary_loss_clip": 0.01132617, + "auxiliary_loss_mlp": 0.01085948, + "balance_loss_clip": 1.0288856, + "balance_loss_mlp": 1.00550508, + "epoch": 0.5479468526423376, + "flos": 12896599962240.0, + "grad_norm": 2.1846990604539247, + "language_loss": 0.77155077, + "learning_rate": 1.7869666782504668e-06, + "loss": 0.7937364, + "num_input_tokens_seen": 98297950, + "step": 4557, + "time_per_iteration": 2.62422513961792 + }, + { + "auxiliary_loss_clip": 0.01120116, + "auxiliary_loss_mlp": 0.01084952, + "balance_loss_clip": 1.02646375, + "balance_loss_mlp": 1.00465274, + "epoch": 0.5480670955329766, + "flos": 18588800142720.0, + "grad_norm": 1.6689205751677543, + "language_loss": 0.68163025, + "learning_rate": 1.7861921544150867e-06, + "loss": 0.70368087, + "num_input_tokens_seen": 98316800, + "step": 4558, + "time_per_iteration": 2.7272047996520996 + }, + { + "auxiliary_loss_clip": 0.01089627, + "auxiliary_loss_mlp": 0.00873059, + "balance_loss_clip": 1.02425647, + "balance_loss_mlp": 1.00019312, + "epoch": 0.5481873384236157, + "flos": 15954608338560.0, + "grad_norm": 1.7511210163029451, + "language_loss": 0.76529461, + "learning_rate": 1.7854176630141856e-06, + "loss": 0.78492147, + "num_input_tokens_seen": 98333935, + "step": 4559, + "time_per_iteration": 2.832611322402954 + }, + { + "auxiliary_loss_clip": 0.01140525, + "auxiliary_loss_mlp": 0.01085201, + "balance_loss_clip": 1.02849531, + "balance_loss_mlp": 1.00471115, + "epoch": 0.5483075813142548, + "flos": 22783812606720.0, + "grad_norm": 2.067097123399735, + "language_loss": 0.84523195, + "learning_rate": 1.784643204165255e-06, + "loss": 0.86748922, + "num_input_tokens_seen": 98353255, + "step": 4560, + "time_per_iteration": 2.6470723152160645 + }, + { + "auxiliary_loss_clip": 0.01124621, + "auxiliary_loss_mlp": 0.01084096, + "balance_loss_clip": 1.02864146, + "balance_loss_mlp": 1.00365353, + "epoch": 0.5484278242048939, + "flos": 19317212046720.0, + "grad_norm": 4.175033965812387, + "language_loss": 0.77193582, + "learning_rate": 1.7838687779857783e-06, + "loss": 0.79402304, + "num_input_tokens_seen": 98371130, + "step": 4561, + "time_per_iteration": 2.673336982727051 + }, + { + "auxiliary_loss_clip": 0.01120134, + "auxiliary_loss_mlp": 0.0108465, + "balance_loss_clip": 1.02648425, + "balance_loss_mlp": 1.00430286, + "epoch": 0.5485480670955329, + "flos": 22816024128000.0, + "grad_norm": 1.706757866132647, + "language_loss": 0.64000523, + "learning_rate": 1.7830943845932366e-06, + "loss": 0.66205299, + "num_input_tokens_seen": 98390455, + "step": 4562, + "time_per_iteration": 2.7164313793182373 + }, + { + "auxiliary_loss_clip": 0.01122426, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_clip": 1.02867579, + "balance_loss_mlp": 1.00495219, + "epoch": 0.5486683099861721, + "flos": 22671304231680.0, + "grad_norm": 1.551397216987084, + "language_loss": 0.75313747, + "learning_rate": 1.7823200241051044e-06, + "loss": 0.77521622, + "num_input_tokens_seen": 98409370, + "step": 4563, + "time_per_iteration": 2.7533419132232666 + }, + { + "auxiliary_loss_clip": 0.01139859, + "auxiliary_loss_mlp": 0.01084615, + "balance_loss_clip": 1.02836251, + "balance_loss_mlp": 1.00431585, + "epoch": 0.5487885528768112, + "flos": 23149383275520.0, + "grad_norm": 1.8112037904229767, + "language_loss": 0.8065778, + "learning_rate": 1.7815456966388513e-06, + "loss": 0.82882261, + "num_input_tokens_seen": 98428465, + "step": 4564, + "time_per_iteration": 2.6152424812316895 + }, + { + "auxiliary_loss_clip": 0.01097742, + "auxiliary_loss_mlp": 0.01084811, + "balance_loss_clip": 1.02760005, + "balance_loss_mlp": 1.00441599, + "epoch": 0.5489087957674502, + "flos": 22053928245120.0, + "grad_norm": 1.9965257878441947, + "language_loss": 0.80756938, + "learning_rate": 1.780771402311943e-06, + "loss": 0.82939494, + "num_input_tokens_seen": 98447300, + "step": 4565, + "time_per_iteration": 4.569764852523804 + }, + { + "auxiliary_loss_clip": 0.01116041, + "auxiliary_loss_mlp": 0.01084608, + "balance_loss_clip": 1.02786195, + "balance_loss_mlp": 1.00416541, + "epoch": 0.5490290386580894, + "flos": 24315977191680.0, + "grad_norm": 1.6774449245963539, + "language_loss": 0.78715432, + "learning_rate": 1.7799971412418374e-06, + "loss": 0.80916083, + "num_input_tokens_seen": 98468695, + "step": 4566, + "time_per_iteration": 2.7809135913848877 + }, + { + "auxiliary_loss_clip": 0.01097875, + "auxiliary_loss_mlp": 0.01084431, + "balance_loss_clip": 1.02777898, + "balance_loss_mlp": 1.0038929, + "epoch": 0.5491492815487284, + "flos": 18294942977280.0, + "grad_norm": 1.9623713127335807, + "language_loss": 0.73802388, + "learning_rate": 1.7792229135459918e-06, + "loss": 0.75984693, + "num_input_tokens_seen": 98485345, + "step": 4567, + "time_per_iteration": 2.771528959274292 + }, + { + "auxiliary_loss_clip": 0.01076617, + "auxiliary_loss_mlp": 0.010798, + "balance_loss_clip": 1.01465416, + "balance_loss_mlp": 1.00045383, + "epoch": 0.5492695244393675, + "flos": 64550257050240.0, + "grad_norm": 0.7429673549118018, + "language_loss": 0.61658698, + "learning_rate": 1.7784487193418538e-06, + "loss": 0.63815111, + "num_input_tokens_seen": 98543195, + "step": 4568, + "time_per_iteration": 4.239573955535889 + }, + { + "auxiliary_loss_clip": 0.01111612, + "auxiliary_loss_mlp": 0.01083794, + "balance_loss_clip": 1.02553129, + "balance_loss_mlp": 1.00330377, + "epoch": 0.5493897673300067, + "flos": 17379579761280.0, + "grad_norm": 2.485776300581439, + "language_loss": 0.60471535, + "learning_rate": 1.7776745587468698e-06, + "loss": 0.62666941, + "num_input_tokens_seen": 98560620, + "step": 4569, + "time_per_iteration": 2.7450754642486572 + }, + { + "auxiliary_loss_clip": 0.01139208, + "auxiliary_loss_mlp": 0.01083931, + "balance_loss_clip": 1.02784491, + "balance_loss_mlp": 1.00348818, + "epoch": 0.5495100102206457, + "flos": 19901765980800.0, + "grad_norm": 2.3918213648837297, + "language_loss": 0.81615758, + "learning_rate": 1.7769004318784776e-06, + "loss": 0.83838892, + "num_input_tokens_seen": 98578265, + "step": 4570, + "time_per_iteration": 2.609494686126709 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01085275, + "balance_loss_clip": 1.02748775, + "balance_loss_mlp": 1.00473726, + "epoch": 0.5496302531112848, + "flos": 16727190992640.0, + "grad_norm": 1.6344080201943867, + "language_loss": 0.80693239, + "learning_rate": 1.776126338854113e-06, + "loss": 0.82908225, + "num_input_tokens_seen": 98596055, + "step": 4571, + "time_per_iteration": 3.6879541873931885 + }, + { + "auxiliary_loss_clip": 0.01124117, + "auxiliary_loss_mlp": 0.01085173, + "balance_loss_clip": 1.02365029, + "balance_loss_mlp": 1.00487375, + "epoch": 0.5497504960019239, + "flos": 24572343536640.0, + "grad_norm": 1.6532746385535575, + "language_loss": 0.84530544, + "learning_rate": 1.7753522797912044e-06, + "loss": 0.86739838, + "num_input_tokens_seen": 98616140, + "step": 4572, + "time_per_iteration": 2.669980049133301 + }, + { + "auxiliary_loss_clip": 0.01123141, + "auxiliary_loss_mlp": 0.01084787, + "balance_loss_clip": 1.02803123, + "balance_loss_mlp": 1.00429654, + "epoch": 0.549870738892563, + "flos": 15450494912640.0, + "grad_norm": 2.2882258427125217, + "language_loss": 0.69724739, + "learning_rate": 1.7745782548071765e-06, + "loss": 0.71932667, + "num_input_tokens_seen": 98633035, + "step": 4573, + "time_per_iteration": 2.7347781658172607 + }, + { + "auxiliary_loss_clip": 0.01103011, + "auxiliary_loss_mlp": 0.01085135, + "balance_loss_clip": 1.02167177, + "balance_loss_mlp": 1.00474, + "epoch": 0.549990981783202, + "flos": 21069114082560.0, + "grad_norm": 1.5604502092338217, + "language_loss": 0.74436557, + "learning_rate": 1.7738042640194482e-06, + "loss": 0.76624709, + "num_input_tokens_seen": 98652700, + "step": 4574, + "time_per_iteration": 2.74709415435791 + }, + { + "auxiliary_loss_clip": 0.01139393, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_clip": 1.02856779, + "balance_loss_mlp": 1.0040828, + "epoch": 0.5501112246738411, + "flos": 21395901041280.0, + "grad_norm": 1.712951657181795, + "language_loss": 0.70424891, + "learning_rate": 1.7730303075454335e-06, + "loss": 0.72648805, + "num_input_tokens_seen": 98671590, + "step": 4575, + "time_per_iteration": 2.6115469932556152 + }, + { + "auxiliary_loss_clip": 0.01112704, + "auxiliary_loss_mlp": 0.01086415, + "balance_loss_clip": 1.02725279, + "balance_loss_mlp": 1.00592446, + "epoch": 0.5502314675644803, + "flos": 17456931699840.0, + "grad_norm": 2.044821827398908, + "language_loss": 0.84930766, + "learning_rate": 1.7722563855025402e-06, + "loss": 0.87129891, + "num_input_tokens_seen": 98689620, + "step": 4576, + "time_per_iteration": 2.711460828781128 + }, + { + "auxiliary_loss_clip": 0.01123015, + "auxiliary_loss_mlp": 0.01084118, + "balance_loss_clip": 1.02773523, + "balance_loss_mlp": 1.0036757, + "epoch": 0.5503517104551193, + "flos": 24310410583680.0, + "grad_norm": 5.689522593271219, + "language_loss": 0.71164334, + "learning_rate": 1.7714824980081721e-06, + "loss": 0.7337147, + "num_input_tokens_seen": 98708915, + "step": 4577, + "time_per_iteration": 2.7170374393463135 + }, + { + "auxiliary_loss_clip": 0.01130475, + "auxiliary_loss_mlp": 0.0108453, + "balance_loss_clip": 1.02895594, + "balance_loss_mlp": 1.00418246, + "epoch": 0.5504719533457584, + "flos": 22419427086720.0, + "grad_norm": 1.9596652353989452, + "language_loss": 0.73887908, + "learning_rate": 1.7707086451797276e-06, + "loss": 0.76102912, + "num_input_tokens_seen": 98729790, + "step": 4578, + "time_per_iteration": 2.7499208450317383 + }, + { + "auxiliary_loss_clip": 0.01094627, + "auxiliary_loss_mlp": 0.01078988, + "balance_loss_clip": 1.02250743, + "balance_loss_mlp": 1.00002384, + "epoch": 0.5505921962363975, + "flos": 67294155968640.0, + "grad_norm": 0.7002585178927155, + "language_loss": 0.52381748, + "learning_rate": 1.7699348271345993e-06, + "loss": 0.54555362, + "num_input_tokens_seen": 98792415, + "step": 4579, + "time_per_iteration": 3.258075714111328 + }, + { + "auxiliary_loss_clip": 0.01087935, + "auxiliary_loss_mlp": 0.01079047, + "balance_loss_clip": 1.0235188, + "balance_loss_mlp": 1.00008249, + "epoch": 0.5507124391270366, + "flos": 45685125578880.0, + "grad_norm": 0.7137539263090594, + "language_loss": 0.54456955, + "learning_rate": 1.7691610439901753e-06, + "loss": 0.56623936, + "num_input_tokens_seen": 98855350, + "step": 4580, + "time_per_iteration": 3.397026777267456 + }, + { + "auxiliary_loss_clip": 0.01130401, + "auxiliary_loss_mlp": 0.0108559, + "balance_loss_clip": 1.02717435, + "balance_loss_mlp": 1.00519502, + "epoch": 0.5508326820176757, + "flos": 22273845264000.0, + "grad_norm": 1.6915882799237998, + "language_loss": 0.75470245, + "learning_rate": 1.7683872958638367e-06, + "loss": 0.77686226, + "num_input_tokens_seen": 98874230, + "step": 4581, + "time_per_iteration": 2.687727212905884 + }, + { + "auxiliary_loss_clip": 0.01120284, + "auxiliary_loss_mlp": 0.01084524, + "balance_loss_clip": 1.02594304, + "balance_loss_mlp": 1.00412917, + "epoch": 0.5509529249083148, + "flos": 20012442762240.0, + "grad_norm": 2.053787567487377, + "language_loss": 0.84284365, + "learning_rate": 1.7676135828729614e-06, + "loss": 0.86489177, + "num_input_tokens_seen": 98893940, + "step": 4582, + "time_per_iteration": 2.8026089668273926 + }, + { + "auxiliary_loss_clip": 0.0113072, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_clip": 1.02847981, + "balance_loss_mlp": 1.00427663, + "epoch": 0.5510731677989539, + "flos": 21834801325440.0, + "grad_norm": 2.3846540887784733, + "language_loss": 0.82917738, + "learning_rate": 1.7668399051349205e-06, + "loss": 0.85133183, + "num_input_tokens_seen": 98913620, + "step": 4583, + "time_per_iteration": 2.6760857105255127 + }, + { + "auxiliary_loss_clip": 0.01105283, + "auxiliary_loss_mlp": 0.01084873, + "balance_loss_clip": 1.02524471, + "balance_loss_mlp": 1.00438249, + "epoch": 0.5511934106895929, + "flos": 21467901853440.0, + "grad_norm": 1.9383928749027093, + "language_loss": 0.83247548, + "learning_rate": 1.766066262767081e-06, + "loss": 0.85437703, + "num_input_tokens_seen": 98931460, + "step": 4584, + "time_per_iteration": 2.7538161277770996 + }, + { + "auxiliary_loss_clip": 0.01119561, + "auxiliary_loss_mlp": 0.01084529, + "balance_loss_clip": 1.02724099, + "balance_loss_mlp": 1.00418162, + "epoch": 0.5513136535802321, + "flos": 21068934514560.0, + "grad_norm": 1.9614788559795646, + "language_loss": 0.77362657, + "learning_rate": 1.765292655886803e-06, + "loss": 0.79566741, + "num_input_tokens_seen": 98950105, + "step": 4585, + "time_per_iteration": 2.723099708557129 + }, + { + "auxiliary_loss_clip": 0.0111327, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_clip": 1.02680099, + "balance_loss_mlp": 1.0044657, + "epoch": 0.5514338964708712, + "flos": 27815004754560.0, + "grad_norm": 1.8044482836434819, + "language_loss": 0.70288092, + "learning_rate": 1.764519084611443e-06, + "loss": 0.72486222, + "num_input_tokens_seen": 98970560, + "step": 4586, + "time_per_iteration": 2.7765772342681885 + }, + { + "auxiliary_loss_clip": 0.01120079, + "auxiliary_loss_mlp": 0.0108528, + "balance_loss_clip": 1.02644444, + "balance_loss_mlp": 1.0047425, + "epoch": 0.5515541393615102, + "flos": 21908525990400.0, + "grad_norm": 1.9287010698371971, + "language_loss": 0.77860808, + "learning_rate": 1.7637455490583505e-06, + "loss": 0.80066168, + "num_input_tokens_seen": 98989885, + "step": 4587, + "time_per_iteration": 2.8352019786834717 + }, + { + "auxiliary_loss_clip": 0.01129851, + "auxiliary_loss_mlp": 0.01086155, + "balance_loss_clip": 1.02738094, + "balance_loss_mlp": 1.00571263, + "epoch": 0.5516743822521494, + "flos": 20485422074880.0, + "grad_norm": 1.9067758271978914, + "language_loss": 0.77359855, + "learning_rate": 1.7629720493448701e-06, + "loss": 0.79575866, + "num_input_tokens_seen": 99007180, + "step": 4588, + "time_per_iteration": 2.675879716873169 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01085475, + "balance_loss_clip": 1.02759504, + "balance_loss_mlp": 1.00503254, + "epoch": 0.5517946251427884, + "flos": 14940383915520.0, + "grad_norm": 1.5904077698128145, + "language_loss": 0.85090315, + "learning_rate": 1.7621985855883418e-06, + "loss": 0.87298441, + "num_input_tokens_seen": 99023880, + "step": 4589, + "time_per_iteration": 2.7589523792266846 + }, + { + "auxiliary_loss_clip": 0.01119201, + "auxiliary_loss_mlp": 0.01084598, + "balance_loss_clip": 1.02671194, + "balance_loss_mlp": 1.00415516, + "epoch": 0.5519148680334275, + "flos": 18404865573120.0, + "grad_norm": 1.6669898309690823, + "language_loss": 0.72396088, + "learning_rate": 1.7614251579060983e-06, + "loss": 0.74599886, + "num_input_tokens_seen": 99042475, + "step": 4590, + "time_per_iteration": 3.4919066429138184 + }, + { + "auxiliary_loss_clip": 0.01095431, + "auxiliary_loss_mlp": 0.01086937, + "balance_loss_clip": 1.02696192, + "balance_loss_mlp": 1.00644636, + "epoch": 0.5520351109240667, + "flos": 25113337251840.0, + "grad_norm": 1.5930702123706024, + "language_loss": 0.84799331, + "learning_rate": 1.76065176641547e-06, + "loss": 0.86981702, + "num_input_tokens_seen": 99065185, + "step": 4591, + "time_per_iteration": 3.7252697944641113 + }, + { + "auxiliary_loss_clip": 0.01130899, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_clip": 1.02766311, + "balance_loss_mlp": 1.0057323, + "epoch": 0.5521553538147057, + "flos": 21069545045760.0, + "grad_norm": 1.657368109521344, + "language_loss": 0.77765775, + "learning_rate": 1.759878411233777e-06, + "loss": 0.79982758, + "num_input_tokens_seen": 99083645, + "step": 4592, + "time_per_iteration": 2.677614450454712 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01086614, + "balance_loss_clip": 1.02842236, + "balance_loss_mlp": 1.0061717, + "epoch": 0.5522755967053448, + "flos": 18879999701760.0, + "grad_norm": 2.116608994644038, + "language_loss": 0.75696236, + "learning_rate": 1.7591050924783388e-06, + "loss": 0.77912712, + "num_input_tokens_seen": 99100835, + "step": 4593, + "time_per_iteration": 2.6033945083618164 + }, + { + "auxiliary_loss_clip": 0.01085529, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_clip": 1.02266538, + "balance_loss_mlp": 1.0001086, + "epoch": 0.5523958395959839, + "flos": 64675622494080.0, + "grad_norm": 0.8378602797542396, + "language_loss": 0.57951462, + "learning_rate": 1.7583318102664661e-06, + "loss": 0.60116446, + "num_input_tokens_seen": 99168400, + "step": 4594, + "time_per_iteration": 4.3516480922698975 + }, + { + "auxiliary_loss_clip": 0.01129746, + "auxiliary_loss_mlp": 0.01083961, + "balance_loss_clip": 1.02620888, + "balance_loss_mlp": 1.00351834, + "epoch": 0.552516082486623, + "flos": 10889732211840.0, + "grad_norm": 1.708299828916459, + "language_loss": 0.79082292, + "learning_rate": 1.757558564715466e-06, + "loss": 0.81295997, + "num_input_tokens_seen": 99186475, + "step": 4595, + "time_per_iteration": 2.6696414947509766 + }, + { + "auxiliary_loss_clip": 0.01128939, + "auxiliary_loss_mlp": 0.01084013, + "balance_loss_clip": 1.02717853, + "balance_loss_mlp": 1.00357091, + "epoch": 0.552636325377262, + "flos": 22199797376640.0, + "grad_norm": 2.3119494117623702, + "language_loss": 0.73277938, + "learning_rate": 1.7567853559426386e-06, + "loss": 0.75490892, + "num_input_tokens_seen": 99203525, + "step": 4596, + "time_per_iteration": 2.7492189407348633 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01084093, + "balance_loss_clip": 1.02772415, + "balance_loss_mlp": 1.00374603, + "epoch": 0.5527565682679012, + "flos": 23988184652160.0, + "grad_norm": 1.936855669614235, + "language_loss": 0.75020993, + "learning_rate": 1.7560121840652797e-06, + "loss": 0.77236032, + "num_input_tokens_seen": 99222910, + "step": 4597, + "time_per_iteration": 3.5495288372039795 + }, + { + "auxiliary_loss_clip": 0.01110683, + "auxiliary_loss_mlp": 0.01085252, + "balance_loss_clip": 1.02638388, + "balance_loss_mlp": 1.00471413, + "epoch": 0.5528768111585403, + "flos": 19719267955200.0, + "grad_norm": 1.7596421374882494, + "language_loss": 0.69285816, + "learning_rate": 1.7552390492006782e-06, + "loss": 0.71481752, + "num_input_tokens_seen": 99241230, + "step": 4598, + "time_per_iteration": 2.71811842918396 + }, + { + "auxiliary_loss_clip": 0.01104195, + "auxiliary_loss_mlp": 0.00873164, + "balance_loss_clip": 1.02662849, + "balance_loss_mlp": 1.00019121, + "epoch": 0.5529970540491793, + "flos": 26215975002240.0, + "grad_norm": 1.700154910649734, + "language_loss": 0.64889324, + "learning_rate": 1.7544659514661184e-06, + "loss": 0.66866684, + "num_input_tokens_seen": 99264320, + "step": 4599, + "time_per_iteration": 2.8093435764312744 + }, + { + "auxiliary_loss_clip": 0.01124623, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_clip": 1.02894139, + "balance_loss_mlp": 1.00362301, + "epoch": 0.5531172969398185, + "flos": 24425971614720.0, + "grad_norm": 1.8323088394293305, + "language_loss": 0.79728019, + "learning_rate": 1.7536928909788786e-06, + "loss": 0.81936657, + "num_input_tokens_seen": 99283625, + "step": 4600, + "time_per_iteration": 2.8090708255767822 + }, + { + "auxiliary_loss_clip": 0.01071525, + "auxiliary_loss_mlp": 0.01079007, + "balance_loss_clip": 1.02426636, + "balance_loss_mlp": 1.00004315, + "epoch": 0.5532375398304575, + "flos": 64907316195840.0, + "grad_norm": 0.8756460161295103, + "language_loss": 0.62030506, + "learning_rate": 1.752919867856231e-06, + "loss": 0.64181042, + "num_input_tokens_seen": 99335270, + "step": 4601, + "time_per_iteration": 3.1625006198883057 + }, + { + "auxiliary_loss_clip": 0.01121187, + "auxiliary_loss_mlp": 0.01084304, + "balance_loss_clip": 1.02691531, + "balance_loss_mlp": 1.00400496, + "epoch": 0.5533577827210966, + "flos": 19683105937920.0, + "grad_norm": 1.5014518971125503, + "language_loss": 0.78734499, + "learning_rate": 1.7521468822154436e-06, + "loss": 0.80939984, + "num_input_tokens_seen": 99354185, + "step": 4602, + "time_per_iteration": 2.792121648788452 + }, + { + "auxiliary_loss_clip": 0.01120041, + "auxiliary_loss_mlp": 0.01084785, + "balance_loss_clip": 1.02762735, + "balance_loss_mlp": 1.00439048, + "epoch": 0.5534780256117358, + "flos": 32306496076800.0, + "grad_norm": 1.8023738424993967, + "language_loss": 0.75682819, + "learning_rate": 1.751373934173777e-06, + "loss": 0.77887648, + "num_input_tokens_seen": 99376930, + "step": 4603, + "time_per_iteration": 2.779287099838257 + }, + { + "auxiliary_loss_clip": 0.01140016, + "auxiliary_loss_mlp": 0.01085374, + "balance_loss_clip": 1.0281899, + "balance_loss_mlp": 1.00497937, + "epoch": 0.5535982685023748, + "flos": 23222425582080.0, + "grad_norm": 1.6656413572467055, + "language_loss": 0.72705609, + "learning_rate": 1.750601023848487e-06, + "loss": 0.7493099, + "num_input_tokens_seen": 99397655, + "step": 4604, + "time_per_iteration": 2.693486213684082 + }, + { + "auxiliary_loss_clip": 0.01139659, + "auxiliary_loss_mlp": 0.00873092, + "balance_loss_clip": 1.02876163, + "balance_loss_mlp": 1.00023127, + "epoch": 0.5537185113930139, + "flos": 24352534258560.0, + "grad_norm": 2.7860441237407327, + "language_loss": 0.73820215, + "learning_rate": 1.749828151356823e-06, + "loss": 0.75832963, + "num_input_tokens_seen": 99417850, + "step": 4605, + "time_per_iteration": 2.674217462539673 + }, + { + "auxiliary_loss_clip": 0.01119952, + "auxiliary_loss_mlp": 0.01083215, + "balance_loss_clip": 1.02660918, + "balance_loss_mlp": 1.00291538, + "epoch": 0.553838754283653, + "flos": 23549068886400.0, + "grad_norm": 1.878424114352004, + "language_loss": 0.75423336, + "learning_rate": 1.7490553168160297e-06, + "loss": 0.77626503, + "num_input_tokens_seen": 99438920, + "step": 4606, + "time_per_iteration": 2.7136945724487305 + }, + { + "auxiliary_loss_clip": 0.01114607, + "auxiliary_loss_mlp": 0.01083718, + "balance_loss_clip": 1.02592444, + "balance_loss_mlp": 1.00337112, + "epoch": 0.5539589971742921, + "flos": 17275044205440.0, + "grad_norm": 1.990477551360439, + "language_loss": 0.76347578, + "learning_rate": 1.748282520343345e-06, + "loss": 0.78545898, + "num_input_tokens_seen": 99457950, + "step": 4607, + "time_per_iteration": 2.7148988246917725 + }, + { + "auxiliary_loss_clip": 0.01132408, + "auxiliary_loss_mlp": 0.01085896, + "balance_loss_clip": 1.02865362, + "balance_loss_mlp": 1.00545323, + "epoch": 0.5540792400649311, + "flos": 27564169104000.0, + "grad_norm": 1.8297798053578687, + "language_loss": 0.78402215, + "learning_rate": 1.7475097620560023e-06, + "loss": 0.80620521, + "num_input_tokens_seen": 99478015, + "step": 4608, + "time_per_iteration": 2.737668514251709 + }, + { + "auxiliary_loss_clip": 0.01139589, + "auxiliary_loss_mlp": 0.01083371, + "balance_loss_clip": 1.02797151, + "balance_loss_mlp": 1.00307167, + "epoch": 0.5541994829555702, + "flos": 23878657105920.0, + "grad_norm": 1.6266540826991838, + "language_loss": 0.70767725, + "learning_rate": 1.746737042071228e-06, + "loss": 0.72990686, + "num_input_tokens_seen": 99496520, + "step": 4609, + "time_per_iteration": 2.7134957313537598 + }, + { + "auxiliary_loss_clip": 0.01117409, + "auxiliary_loss_mlp": 0.0108381, + "balance_loss_clip": 1.02623832, + "balance_loss_mlp": 1.00351119, + "epoch": 0.5543197258462094, + "flos": 20115721342080.0, + "grad_norm": 1.7784692292108988, + "language_loss": 0.7917527, + "learning_rate": 1.7459643605062424e-06, + "loss": 0.81376487, + "num_input_tokens_seen": 99513780, + "step": 4610, + "time_per_iteration": 2.619183301925659 + }, + { + "auxiliary_loss_clip": 0.01098413, + "auxiliary_loss_mlp": 0.01085134, + "balance_loss_clip": 1.02340484, + "balance_loss_mlp": 1.00473964, + "epoch": 0.5544399687368484, + "flos": 20916565021440.0, + "grad_norm": 1.587584251189334, + "language_loss": 0.80472493, + "learning_rate": 1.745191717478262e-06, + "loss": 0.82656038, + "num_input_tokens_seen": 99532360, + "step": 4611, + "time_per_iteration": 2.815659284591675 + }, + { + "auxiliary_loss_clip": 0.01113824, + "auxiliary_loss_mlp": 0.01085102, + "balance_loss_clip": 1.02208042, + "balance_loss_mlp": 1.00475478, + "epoch": 0.5545602116274875, + "flos": 25518661297920.0, + "grad_norm": 1.7043743747216387, + "language_loss": 0.79375648, + "learning_rate": 1.7444191131044948e-06, + "loss": 0.81574571, + "num_input_tokens_seen": 99552635, + "step": 4612, + "time_per_iteration": 2.7318313121795654 + }, + { + "auxiliary_loss_clip": 0.01117753, + "auxiliary_loss_mlp": 0.01085367, + "balance_loss_clip": 1.02544248, + "balance_loss_mlp": 1.00497222, + "epoch": 0.5546804545181266, + "flos": 20995568985600.0, + "grad_norm": 1.8184236407868712, + "language_loss": 0.73010874, + "learning_rate": 1.7436465475021456e-06, + "loss": 0.75213999, + "num_input_tokens_seen": 99572685, + "step": 4613, + "time_per_iteration": 2.738595485687256 + }, + { + "auxiliary_loss_clip": 0.01109763, + "auxiliary_loss_mlp": 0.01084368, + "balance_loss_clip": 1.02543902, + "balance_loss_mlp": 1.00397384, + "epoch": 0.5548006974087657, + "flos": 26833638297600.0, + "grad_norm": 4.725297266111796, + "language_loss": 0.71582317, + "learning_rate": 1.7428740207884111e-06, + "loss": 0.7377646, + "num_input_tokens_seen": 99593565, + "step": 4614, + "time_per_iteration": 2.7718122005462646 + }, + { + "auxiliary_loss_clip": 0.01075999, + "auxiliary_loss_mlp": 0.01084145, + "balance_loss_clip": 1.02307129, + "balance_loss_mlp": 1.00379801, + "epoch": 0.5549209402994048, + "flos": 33656414031360.0, + "grad_norm": 3.4254605978392596, + "language_loss": 0.60938996, + "learning_rate": 1.7421015330804833e-06, + "loss": 0.63099146, + "num_input_tokens_seen": 99613485, + "step": 4615, + "time_per_iteration": 2.9104771614074707 + }, + { + "auxiliary_loss_clip": 0.01140038, + "auxiliary_loss_mlp": 0.01084195, + "balance_loss_clip": 1.02840972, + "balance_loss_mlp": 1.00375271, + "epoch": 0.5550411831900439, + "flos": 23769524609280.0, + "grad_norm": 2.0672418705676456, + "language_loss": 0.72680283, + "learning_rate": 1.7413290844955475e-06, + "loss": 0.74904513, + "num_input_tokens_seen": 99633515, + "step": 4616, + "time_per_iteration": 4.499298810958862 + }, + { + "auxiliary_loss_clip": 0.01131548, + "auxiliary_loss_mlp": 0.01085258, + "balance_loss_clip": 1.02869225, + "balance_loss_mlp": 1.00491047, + "epoch": 0.555161426080683, + "flos": 21651189978240.0, + "grad_norm": 1.8599255938980346, + "language_loss": 0.78507614, + "learning_rate": 1.7405566751507843e-06, + "loss": 0.80724424, + "num_input_tokens_seen": 99651560, + "step": 4617, + "time_per_iteration": 2.70222806930542 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01084251, + "balance_loss_clip": 1.0265696, + "balance_loss_mlp": 1.00395203, + "epoch": 0.555281668971322, + "flos": 49563116605440.0, + "grad_norm": 1.7903760540784048, + "language_loss": 0.6776253, + "learning_rate": 1.7397843051633668e-06, + "loss": 0.69957578, + "num_input_tokens_seen": 99674255, + "step": 4618, + "time_per_iteration": 3.033601760864258 + }, + { + "auxiliary_loss_clip": 0.01122198, + "auxiliary_loss_mlp": 0.01085236, + "balance_loss_clip": 1.02614069, + "balance_loss_mlp": 1.00484157, + "epoch": 0.5554019118619612, + "flos": 20741608851840.0, + "grad_norm": 1.6542596407148573, + "language_loss": 0.71272719, + "learning_rate": 1.739011974650464e-06, + "loss": 0.73480153, + "num_input_tokens_seen": 99693585, + "step": 4619, + "time_per_iteration": 2.7691314220428467 + }, + { + "auxiliary_loss_clip": 0.0110058, + "auxiliary_loss_mlp": 0.01084679, + "balance_loss_clip": 1.02400649, + "balance_loss_mlp": 1.00418901, + "epoch": 0.5555221547526003, + "flos": 25483217552640.0, + "grad_norm": 2.354924540792411, + "language_loss": 0.76427257, + "learning_rate": 1.7382396837292365e-06, + "loss": 0.78612518, + "num_input_tokens_seen": 99714045, + "step": 4620, + "time_per_iteration": 3.783646821975708 + }, + { + "auxiliary_loss_clip": 0.011403, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.02916491, + "balance_loss_mlp": 1.00430882, + "epoch": 0.5556423976432393, + "flos": 21762513204480.0, + "grad_norm": 1.8376803968937323, + "language_loss": 0.73226118, + "learning_rate": 1.737467432516841e-06, + "loss": 0.7545107, + "num_input_tokens_seen": 99734145, + "step": 4621, + "time_per_iteration": 2.658825635910034 + }, + { + "auxiliary_loss_clip": 0.01123717, + "auxiliary_loss_mlp": 0.01085853, + "balance_loss_clip": 1.02839971, + "balance_loss_mlp": 1.00541043, + "epoch": 0.5557626405338785, + "flos": 24900171989760.0, + "grad_norm": 6.682896409422879, + "language_loss": 0.7429117, + "learning_rate": 1.7366952211304274e-06, + "loss": 0.76500738, + "num_input_tokens_seen": 99751990, + "step": 4622, + "time_per_iteration": 3.7221288681030273 + }, + { + "auxiliary_loss_clip": 0.01120114, + "auxiliary_loss_mlp": 0.01085045, + "balance_loss_clip": 1.02706957, + "balance_loss_mlp": 1.00465047, + "epoch": 0.5558828834245175, + "flos": 18697501676160.0, + "grad_norm": 2.0467835172781528, + "language_loss": 0.83311713, + "learning_rate": 1.735923049687139e-06, + "loss": 0.8551687, + "num_input_tokens_seen": 99768565, + "step": 4623, + "time_per_iteration": 2.6499030590057373 + }, + { + "auxiliary_loss_clip": 0.01122315, + "auxiliary_loss_mlp": 0.01084825, + "balance_loss_clip": 1.02719629, + "balance_loss_mlp": 1.00452542, + "epoch": 0.5560031263151566, + "flos": 27272179445760.0, + "grad_norm": 1.6397983250426267, + "language_loss": 0.73851335, + "learning_rate": 1.7351509183041144e-06, + "loss": 0.76058477, + "num_input_tokens_seen": 99788895, + "step": 4624, + "time_per_iteration": 2.7691431045532227 + }, + { + "auxiliary_loss_clip": 0.01141042, + "auxiliary_loss_mlp": 0.01084824, + "balance_loss_clip": 1.02956939, + "balance_loss_mlp": 1.00442958, + "epoch": 0.5561233692057957, + "flos": 23403738458880.0, + "grad_norm": 1.687265690430603, + "language_loss": 0.71605587, + "learning_rate": 1.7343788270984852e-06, + "loss": 0.73831457, + "num_input_tokens_seen": 99808035, + "step": 4625, + "time_per_iteration": 2.6136021614074707 + }, + { + "auxiliary_loss_clip": 0.01113842, + "auxiliary_loss_mlp": 0.01085318, + "balance_loss_clip": 1.0265907, + "balance_loss_mlp": 1.00487602, + "epoch": 0.5562436120964348, + "flos": 37670867804160.0, + "grad_norm": 2.0921745261393205, + "language_loss": 0.74851155, + "learning_rate": 1.7336067761873764e-06, + "loss": 0.77050316, + "num_input_tokens_seen": 99830460, + "step": 4626, + "time_per_iteration": 2.8713343143463135 + }, + { + "auxiliary_loss_clip": 0.0113066, + "auxiliary_loss_mlp": 0.01085657, + "balance_loss_clip": 1.02704489, + "balance_loss_mlp": 1.00511885, + "epoch": 0.5563638549870739, + "flos": 25155245445120.0, + "grad_norm": 1.7582864956632076, + "language_loss": 0.76105368, + "learning_rate": 1.7328347656879076e-06, + "loss": 0.78321689, + "num_input_tokens_seen": 99850320, + "step": 4627, + "time_per_iteration": 2.663196086883545 + }, + { + "auxiliary_loss_clip": 0.01114491, + "auxiliary_loss_mlp": 0.01085363, + "balance_loss_clip": 1.02742505, + "balance_loss_mlp": 1.0049206, + "epoch": 0.556484097877713, + "flos": 13581810783360.0, + "grad_norm": 2.1675932843208043, + "language_loss": 0.68761659, + "learning_rate": 1.7320627957171927e-06, + "loss": 0.70961511, + "num_input_tokens_seen": 99864980, + "step": 4628, + "time_per_iteration": 2.7581441402435303 + }, + { + "auxiliary_loss_clip": 0.0113984, + "auxiliary_loss_mlp": 0.01085923, + "balance_loss_clip": 1.02891755, + "balance_loss_mlp": 1.00538564, + "epoch": 0.5566043407683521, + "flos": 24681368292480.0, + "grad_norm": 1.9877279780161283, + "language_loss": 0.81376243, + "learning_rate": 1.7312908663923382e-06, + "loss": 0.83601999, + "num_input_tokens_seen": 99881155, + "step": 4629, + "time_per_iteration": 2.648115396499634 + }, + { + "auxiliary_loss_clip": 0.01130386, + "auxiliary_loss_mlp": 0.01084726, + "balance_loss_clip": 1.02735078, + "balance_loss_mlp": 1.00428319, + "epoch": 0.5567245836589911, + "flos": 20588161950720.0, + "grad_norm": 2.1511205107092204, + "language_loss": 0.67734659, + "learning_rate": 1.7305189778304463e-06, + "loss": 0.69949776, + "num_input_tokens_seen": 99899330, + "step": 4630, + "time_per_iteration": 2.6487302780151367 + }, + { + "auxiliary_loss_clip": 0.01122257, + "auxiliary_loss_mlp": 0.0108592, + "balance_loss_clip": 1.02869141, + "balance_loss_mlp": 1.00562036, + "epoch": 0.5568448265496303, + "flos": 20704189858560.0, + "grad_norm": 1.758648691947155, + "language_loss": 0.80021137, + "learning_rate": 1.729747130148611e-06, + "loss": 0.82229316, + "num_input_tokens_seen": 99918525, + "step": 4631, + "time_per_iteration": 2.746518611907959 + }, + { + "auxiliary_loss_clip": 0.01111607, + "auxiliary_loss_mlp": 0.01085369, + "balance_loss_clip": 1.02657986, + "balance_loss_mlp": 1.00483155, + "epoch": 0.5569650694402694, + "flos": 25302910256640.0, + "grad_norm": 1.745382517365711, + "language_loss": 0.76885045, + "learning_rate": 1.7289753234639208e-06, + "loss": 0.79082024, + "num_input_tokens_seen": 99937500, + "step": 4632, + "time_per_iteration": 2.765141487121582 + }, + { + "auxiliary_loss_clip": 0.01129817, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_clip": 1.02685952, + "balance_loss_mlp": 1.00519073, + "epoch": 0.5570853123309084, + "flos": 19712623939200.0, + "grad_norm": 1.7465797550788464, + "language_loss": 0.76502264, + "learning_rate": 1.7282035578934592e-06, + "loss": 0.78717667, + "num_input_tokens_seen": 99955665, + "step": 4633, + "time_per_iteration": 2.751230001449585 + }, + { + "auxiliary_loss_clip": 0.01113449, + "auxiliary_loss_mlp": 0.01084617, + "balance_loss_clip": 1.02669811, + "balance_loss_mlp": 1.00431788, + "epoch": 0.5572055552215476, + "flos": 16108091153280.0, + "grad_norm": 2.1424892489501173, + "language_loss": 0.78945249, + "learning_rate": 1.727431833554301e-06, + "loss": 0.8114332, + "num_input_tokens_seen": 99974140, + "step": 4634, + "time_per_iteration": 2.6617915630340576 + }, + { + "auxiliary_loss_clip": 0.01079094, + "auxiliary_loss_mlp": 0.01084291, + "balance_loss_clip": 1.02523184, + "balance_loss_mlp": 1.00380135, + "epoch": 0.5573257981121866, + "flos": 17128815937920.0, + "grad_norm": 1.8916390530983618, + "language_loss": 0.77016956, + "learning_rate": 1.7266601505635175e-06, + "loss": 0.79180336, + "num_input_tokens_seen": 99991480, + "step": 4635, + "time_per_iteration": 2.808370351791382 + }, + { + "auxiliary_loss_clip": 0.01129753, + "auxiliary_loss_mlp": 0.01083876, + "balance_loss_clip": 1.0280751, + "balance_loss_mlp": 1.00343323, + "epoch": 0.5574460410028257, + "flos": 18807029222400.0, + "grad_norm": 1.9373369609389417, + "language_loss": 0.75904208, + "learning_rate": 1.7258885090381717e-06, + "loss": 0.78117836, + "num_input_tokens_seen": 100009520, + "step": 4636, + "time_per_iteration": 2.6063613891601562 + }, + { + "auxiliary_loss_clip": 0.01120702, + "auxiliary_loss_mlp": 0.01086221, + "balance_loss_clip": 1.02628112, + "balance_loss_mlp": 1.00582576, + "epoch": 0.5575662838934649, + "flos": 29642678530560.0, + "grad_norm": 1.7906245132961436, + "language_loss": 0.78479302, + "learning_rate": 1.7251169090953213e-06, + "loss": 0.80686224, + "num_input_tokens_seen": 100029995, + "step": 4637, + "time_per_iteration": 2.802755355834961 + }, + { + "auxiliary_loss_clip": 0.01128906, + "auxiliary_loss_mlp": 0.0108623, + "balance_loss_clip": 1.02718091, + "balance_loss_mlp": 1.00573969, + "epoch": 0.5576865267841039, + "flos": 22054466949120.0, + "grad_norm": 2.6167599568217907, + "language_loss": 0.76817608, + "learning_rate": 1.7243453508520168e-06, + "loss": 0.79032749, + "num_input_tokens_seen": 100046980, + "step": 4638, + "time_per_iteration": 2.7161359786987305 + }, + { + "auxiliary_loss_clip": 0.01114058, + "auxiliary_loss_mlp": 0.0108547, + "balance_loss_clip": 1.02521324, + "balance_loss_mlp": 1.00488496, + "epoch": 0.557806769674743, + "flos": 17196040241280.0, + "grad_norm": 9.246517687037423, + "language_loss": 0.84342796, + "learning_rate": 1.7235738344253038e-06, + "loss": 0.8654232, + "num_input_tokens_seen": 100060610, + "step": 4639, + "time_per_iteration": 2.7003910541534424 + }, + { + "auxiliary_loss_clip": 0.01123516, + "auxiliary_loss_mlp": 0.01084162, + "balance_loss_clip": 1.0232935, + "balance_loss_mlp": 1.00362408, + "epoch": 0.557927012565382, + "flos": 24712717887360.0, + "grad_norm": 1.7601193303408433, + "language_loss": 0.82435173, + "learning_rate": 1.72280235993222e-06, + "loss": 0.84642845, + "num_input_tokens_seen": 100078915, + "step": 4640, + "time_per_iteration": 2.687518835067749 + }, + { + "auxiliary_loss_clip": 0.01121221, + "auxiliary_loss_mlp": 0.00873085, + "balance_loss_clip": 1.02547646, + "balance_loss_mlp": 1.00011742, + "epoch": 0.5580472554560212, + "flos": 16983090460800.0, + "grad_norm": 4.774253927160394, + "language_loss": 0.69248962, + "learning_rate": 1.722030927489798e-06, + "loss": 0.71243268, + "num_input_tokens_seen": 100096195, + "step": 4641, + "time_per_iteration": 3.602811336517334 + }, + { + "auxiliary_loss_clip": 0.0111015, + "auxiliary_loss_mlp": 0.01085121, + "balance_loss_clip": 1.02594769, + "balance_loss_mlp": 1.00472665, + "epoch": 0.5581674983466602, + "flos": 23509100027520.0, + "grad_norm": 1.6472985059147802, + "language_loss": 0.74129295, + "learning_rate": 1.7212595372150634e-06, + "loss": 0.76324564, + "num_input_tokens_seen": 100116175, + "step": 4642, + "time_per_iteration": 3.659663677215576 + }, + { + "auxiliary_loss_clip": 0.01139913, + "auxiliary_loss_mlp": 0.01084629, + "balance_loss_clip": 1.02906811, + "balance_loss_mlp": 1.00423431, + "epoch": 0.5582877412372993, + "flos": 13480291969920.0, + "grad_norm": 3.076454457682232, + "language_loss": 0.72570717, + "learning_rate": 1.720488189225035e-06, + "loss": 0.74795258, + "num_input_tokens_seen": 100133875, + "step": 4643, + "time_per_iteration": 2.659078359603882 + }, + { + "auxiliary_loss_clip": 0.01129493, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_clip": 1.02674389, + "balance_loss_mlp": 1.0037508, + "epoch": 0.5584079841279385, + "flos": 21903605827200.0, + "grad_norm": 2.8125126423673557, + "language_loss": 0.79725075, + "learning_rate": 1.7197168836367265e-06, + "loss": 0.81938761, + "num_input_tokens_seen": 100150685, + "step": 4644, + "time_per_iteration": 2.7516486644744873 + }, + { + "auxiliary_loss_clip": 0.01128563, + "auxiliary_loss_mlp": 0.00873067, + "balance_loss_clip": 1.02727151, + "balance_loss_mlp": 1.00017798, + "epoch": 0.5585282270185775, + "flos": 18843550375680.0, + "grad_norm": 1.8395105050085736, + "language_loss": 0.82004941, + "learning_rate": 1.7189456205671433e-06, + "loss": 0.84006566, + "num_input_tokens_seen": 100169530, + "step": 4645, + "time_per_iteration": 3.716125965118408 + }, + { + "auxiliary_loss_clip": 0.01117139, + "auxiliary_loss_mlp": 0.01085348, + "balance_loss_clip": 1.02968812, + "balance_loss_mlp": 1.00485837, + "epoch": 0.5586484699092166, + "flos": 21868449390720.0, + "grad_norm": 1.8360286363828668, + "language_loss": 0.81965339, + "learning_rate": 1.7181744001332866e-06, + "loss": 0.84167826, + "num_input_tokens_seen": 100188140, + "step": 4646, + "time_per_iteration": 2.662182569503784 + }, + { + "auxiliary_loss_clip": 0.01139526, + "auxiliary_loss_mlp": 0.01085647, + "balance_loss_clip": 1.02881587, + "balance_loss_mlp": 1.00530028, + "epoch": 0.5587687127998557, + "flos": 22893232412160.0, + "grad_norm": 1.8226285417629182, + "language_loss": 0.63278997, + "learning_rate": 1.7174032224521493e-06, + "loss": 0.65504175, + "num_input_tokens_seen": 100206850, + "step": 4647, + "time_per_iteration": 3.601741313934326 + }, + { + "auxiliary_loss_clip": 0.01129915, + "auxiliary_loss_mlp": 0.01083611, + "balance_loss_clip": 1.02730715, + "balance_loss_mlp": 1.00326443, + "epoch": 0.5588889556904948, + "flos": 20303067703680.0, + "grad_norm": 1.628732215264398, + "language_loss": 0.6989643, + "learning_rate": 1.7166320876407184e-06, + "loss": 0.72109956, + "num_input_tokens_seen": 100226270, + "step": 4648, + "time_per_iteration": 2.6352145671844482 + }, + { + "auxiliary_loss_clip": 0.01141604, + "auxiliary_loss_mlp": 0.00873101, + "balance_loss_clip": 1.03010571, + "balance_loss_mlp": 1.00019217, + "epoch": 0.5590091985811338, + "flos": 16472153450880.0, + "grad_norm": 1.907873018803795, + "language_loss": 0.67430824, + "learning_rate": 1.7158609958159742e-06, + "loss": 0.69445527, + "num_input_tokens_seen": 100243675, + "step": 4649, + "time_per_iteration": 2.6336097717285156 + }, + { + "auxiliary_loss_clip": 0.01075453, + "auxiliary_loss_mlp": 0.01085308, + "balance_loss_clip": 1.02328503, + "balance_loss_mlp": 1.00481844, + "epoch": 0.559129441471773, + "flos": 14532186781440.0, + "grad_norm": 1.9979123355824122, + "language_loss": 0.78235692, + "learning_rate": 1.7150899470948911e-06, + "loss": 0.8039645, + "num_input_tokens_seen": 100258940, + "step": 4650, + "time_per_iteration": 2.860593318939209 + }, + { + "auxiliary_loss_clip": 0.01104055, + "auxiliary_loss_mlp": 0.01079736, + "balance_loss_clip": 1.02433348, + "balance_loss_mlp": 1.0007714, + "epoch": 0.5592496843624121, + "flos": 60521009852160.0, + "grad_norm": 0.7996459269205111, + "language_loss": 0.56641728, + "learning_rate": 1.7143189415944365e-06, + "loss": 0.58825523, + "num_input_tokens_seen": 100323400, + "step": 4651, + "time_per_iteration": 3.3113179206848145 + }, + { + "auxiliary_loss_clip": 0.01128517, + "auxiliary_loss_mlp": 0.01084908, + "balance_loss_clip": 1.02685905, + "balance_loss_mlp": 1.00446618, + "epoch": 0.5593699272530511, + "flos": 20886256920960.0, + "grad_norm": 1.5743038594684908, + "language_loss": 0.76381099, + "learning_rate": 1.7135479794315714e-06, + "loss": 0.78594518, + "num_input_tokens_seen": 100340355, + "step": 4652, + "time_per_iteration": 2.7454700469970703 + }, + { + "auxiliary_loss_clip": 0.01107798, + "auxiliary_loss_mlp": 0.01085028, + "balance_loss_clip": 1.02455807, + "balance_loss_mlp": 1.00468135, + "epoch": 0.5594901701436903, + "flos": 12896743616640.0, + "grad_norm": 1.9366171678511894, + "language_loss": 0.78728151, + "learning_rate": 1.7127770607232502e-06, + "loss": 0.80920976, + "num_input_tokens_seen": 100358900, + "step": 4653, + "time_per_iteration": 2.7336456775665283 + }, + { + "auxiliary_loss_clip": 0.01111169, + "auxiliary_loss_mlp": 0.01085631, + "balance_loss_clip": 1.0254159, + "balance_loss_mlp": 1.0053314, + "epoch": 0.5596104130343293, + "flos": 23112107936640.0, + "grad_norm": 1.9174045337569134, + "language_loss": 0.79674315, + "learning_rate": 1.7120061855864204e-06, + "loss": 0.81871116, + "num_input_tokens_seen": 100378910, + "step": 4654, + "time_per_iteration": 2.7687366008758545 + }, + { + "auxiliary_loss_clip": 0.01131104, + "auxiliary_loss_mlp": 0.01085262, + "balance_loss_clip": 1.02908373, + "balance_loss_mlp": 1.00486684, + "epoch": 0.5597306559249684, + "flos": 25957812977280.0, + "grad_norm": 1.8089218892439733, + "language_loss": 0.70946205, + "learning_rate": 1.7112353541380233e-06, + "loss": 0.73162568, + "num_input_tokens_seen": 100398770, + "step": 4655, + "time_per_iteration": 2.748847007751465 + }, + { + "auxiliary_loss_clip": 0.01122498, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_clip": 1.02919471, + "balance_loss_mlp": 1.00404322, + "epoch": 0.5598508988156076, + "flos": 22492289825280.0, + "grad_norm": 1.369229907793638, + "language_loss": 0.72011811, + "learning_rate": 1.7104645664949931e-06, + "loss": 0.74218839, + "num_input_tokens_seen": 100421240, + "step": 4656, + "time_per_iteration": 2.713825225830078 + }, + { + "auxiliary_loss_clip": 0.01121303, + "auxiliary_loss_mlp": 0.01084085, + "balance_loss_clip": 1.02693248, + "balance_loss_mlp": 1.00368989, + "epoch": 0.5599711417062466, + "flos": 23112538899840.0, + "grad_norm": 1.7093426226608268, + "language_loss": 0.71187544, + "learning_rate": 1.7096938227742584e-06, + "loss": 0.73392934, + "num_input_tokens_seen": 100442370, + "step": 4657, + "time_per_iteration": 2.7260613441467285 + }, + { + "auxiliary_loss_clip": 0.011384, + "auxiliary_loss_mlp": 0.01084875, + "balance_loss_clip": 1.02776337, + "balance_loss_mlp": 1.00452781, + "epoch": 0.5600913845968857, + "flos": 22339345714560.0, + "grad_norm": 1.749704734810106, + "language_loss": 0.84465832, + "learning_rate": 1.70892312309274e-06, + "loss": 0.86689103, + "num_input_tokens_seen": 100460260, + "step": 4658, + "time_per_iteration": 2.6298680305480957 + }, + { + "auxiliary_loss_clip": 0.01122339, + "auxiliary_loss_mlp": 0.01085036, + "balance_loss_clip": 1.0268209, + "balance_loss_mlp": 1.00468934, + "epoch": 0.5602116274875248, + "flos": 17633791290240.0, + "grad_norm": 1.8322899793745917, + "language_loss": 0.68866378, + "learning_rate": 1.7081524675673523e-06, + "loss": 0.71073759, + "num_input_tokens_seen": 100475750, + "step": 4659, + "time_per_iteration": 2.716212034225464 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.0107932, + "balance_loss_clip": 1.02286816, + "balance_loss_mlp": 0.99997389, + "epoch": 0.5603318703781639, + "flos": 70115945529600.0, + "grad_norm": 0.7660932310575435, + "language_loss": 0.5964247, + "learning_rate": 1.7073818563150026e-06, + "loss": 0.61824679, + "num_input_tokens_seen": 100537830, + "step": 4660, + "time_per_iteration": 3.341806650161743 + }, + { + "auxiliary_loss_clip": 0.01129935, + "auxiliary_loss_mlp": 0.01083974, + "balance_loss_clip": 1.02670014, + "balance_loss_mlp": 1.00348353, + "epoch": 0.560452113268803, + "flos": 18545850455040.0, + "grad_norm": 2.144330685527052, + "language_loss": 0.86471045, + "learning_rate": 1.7066112894525935e-06, + "loss": 0.88684952, + "num_input_tokens_seen": 100555910, + "step": 4661, + "time_per_iteration": 2.7210373878479004 + }, + { + "auxiliary_loss_clip": 0.01119763, + "auxiliary_loss_mlp": 0.01085197, + "balance_loss_clip": 1.02651799, + "balance_loss_mlp": 1.00461173, + "epoch": 0.5605723561594421, + "flos": 25264665250560.0, + "grad_norm": 1.6767313659235201, + "language_loss": 0.72461176, + "learning_rate": 1.7058407670970177e-06, + "loss": 0.74666142, + "num_input_tokens_seen": 100577385, + "step": 4662, + "time_per_iteration": 2.781998872756958 + }, + { + "auxiliary_loss_clip": 0.01113547, + "auxiliary_loss_mlp": 0.01085202, + "balance_loss_clip": 1.0271498, + "balance_loss_mlp": 1.00466466, + "epoch": 0.5606925990500812, + "flos": 20594949621120.0, + "grad_norm": 2.076791445252128, + "language_loss": 0.6136471, + "learning_rate": 1.7050702893651643e-06, + "loss": 0.63563454, + "num_input_tokens_seen": 100596965, + "step": 4663, + "time_per_iteration": 2.7310285568237305 + }, + { + "auxiliary_loss_clip": 0.01129311, + "auxiliary_loss_mlp": 0.01085705, + "balance_loss_clip": 1.0272491, + "balance_loss_mlp": 1.00521469, + "epoch": 0.5608128419407202, + "flos": 35006044677120.0, + "grad_norm": 2.0002423911994893, + "language_loss": 0.75805312, + "learning_rate": 1.7042998563739134e-06, + "loss": 0.78020328, + "num_input_tokens_seen": 100615315, + "step": 4664, + "time_per_iteration": 2.7848548889160156 + }, + { + "auxiliary_loss_clip": 0.01121841, + "auxiliary_loss_mlp": 0.01085236, + "balance_loss_clip": 1.0272119, + "balance_loss_mlp": 1.0046984, + "epoch": 0.5609330848313594, + "flos": 24639819235200.0, + "grad_norm": 2.6409933749196686, + "language_loss": 0.71511769, + "learning_rate": 1.703529468240139e-06, + "loss": 0.73718852, + "num_input_tokens_seen": 100634185, + "step": 4665, + "time_per_iteration": 2.7639565467834473 + }, + { + "auxiliary_loss_clip": 0.01112261, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_clip": 1.02500701, + "balance_loss_mlp": 1.00389624, + "epoch": 0.5610533277219985, + "flos": 18762894385920.0, + "grad_norm": 2.0731783903735588, + "language_loss": 0.73312163, + "learning_rate": 1.7027591250807088e-06, + "loss": 0.75508761, + "num_input_tokens_seen": 100651360, + "step": 4666, + "time_per_iteration": 2.7322754859924316 + }, + { + "auxiliary_loss_clip": 0.01140551, + "auxiliary_loss_mlp": 0.01084459, + "balance_loss_clip": 1.02938056, + "balance_loss_mlp": 1.00401664, + "epoch": 0.5611735706126375, + "flos": 15012384727680.0, + "grad_norm": 2.044988407255082, + "language_loss": 0.84743023, + "learning_rate": 1.7019888270124825e-06, + "loss": 0.86968029, + "num_input_tokens_seen": 100668525, + "step": 4667, + "time_per_iteration": 3.541694164276123 + }, + { + "auxiliary_loss_clip": 0.01132632, + "auxiliary_loss_mlp": 0.01085391, + "balance_loss_clip": 1.02932262, + "balance_loss_mlp": 1.00499678, + "epoch": 0.5612938135032767, + "flos": 16468167041280.0, + "grad_norm": 1.9267423266203567, + "language_loss": 0.82123947, + "learning_rate": 1.7012185741523147e-06, + "loss": 0.84341967, + "num_input_tokens_seen": 100684850, + "step": 4668, + "time_per_iteration": 3.6344032287597656 + }, + { + "auxiliary_loss_clip": 0.01140143, + "auxiliary_loss_mlp": 0.01084876, + "balance_loss_clip": 1.02924705, + "balance_loss_mlp": 1.00452864, + "epoch": 0.5614140563939157, + "flos": 25666433850240.0, + "grad_norm": 2.002744411235064, + "language_loss": 0.62780869, + "learning_rate": 1.7004483666170514e-06, + "loss": 0.65005887, + "num_input_tokens_seen": 100705345, + "step": 4669, + "time_per_iteration": 2.6821436882019043 + }, + { + "auxiliary_loss_clip": 0.01128884, + "auxiliary_loss_mlp": 0.01085341, + "balance_loss_clip": 1.02699244, + "balance_loss_mlp": 1.00494647, + "epoch": 0.5615342992845548, + "flos": 24717566223360.0, + "grad_norm": 2.1486107786854123, + "language_loss": 0.80588412, + "learning_rate": 1.699678204523533e-06, + "loss": 0.82802641, + "num_input_tokens_seen": 100725210, + "step": 4670, + "time_per_iteration": 3.742093801498413 + }, + { + "auxiliary_loss_clip": 0.01120714, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_clip": 1.02767336, + "balance_loss_mlp": 1.00484693, + "epoch": 0.5616545421751938, + "flos": 22015934634240.0, + "grad_norm": 2.4251954299704037, + "language_loss": 0.68973053, + "learning_rate": 1.6989080879885918e-06, + "loss": 0.71179062, + "num_input_tokens_seen": 100743070, + "step": 4671, + "time_per_iteration": 2.7648556232452393 + }, + { + "auxiliary_loss_clip": 0.01093225, + "auxiliary_loss_mlp": 0.01079396, + "balance_loss_clip": 1.02153802, + "balance_loss_mlp": 1.00005043, + "epoch": 0.561774785065833, + "flos": 53760358690560.0, + "grad_norm": 0.9039838024094087, + "language_loss": 0.61082333, + "learning_rate": 1.6981380171290544e-06, + "loss": 0.63254958, + "num_input_tokens_seen": 100804095, + "step": 4672, + "time_per_iteration": 4.221452474594116 + }, + { + "auxiliary_loss_clip": 0.01105689, + "auxiliary_loss_mlp": 0.01085852, + "balance_loss_clip": 1.02666926, + "balance_loss_mlp": 1.00536156, + "epoch": 0.5618950279564721, + "flos": 19750007018880.0, + "grad_norm": 2.019229173071373, + "language_loss": 0.74099094, + "learning_rate": 1.6973679920617396e-06, + "loss": 0.76290637, + "num_input_tokens_seen": 100821630, + "step": 4673, + "time_per_iteration": 2.765333652496338 + }, + { + "auxiliary_loss_clip": 0.01119087, + "auxiliary_loss_mlp": 0.01084937, + "balance_loss_clip": 1.02716947, + "balance_loss_mlp": 1.00444722, + "epoch": 0.5620152708471111, + "flos": 16800592435200.0, + "grad_norm": 1.8814034563675903, + "language_loss": 0.84769636, + "learning_rate": 1.6965980129034603e-06, + "loss": 0.86973661, + "num_input_tokens_seen": 100839015, + "step": 4674, + "time_per_iteration": 2.7360129356384277 + }, + { + "auxiliary_loss_clip": 0.01120634, + "auxiliary_loss_mlp": 0.01085035, + "balance_loss_clip": 1.02772689, + "balance_loss_mlp": 1.00463986, + "epoch": 0.5621355137377503, + "flos": 26797799502720.0, + "grad_norm": 1.6245802502578839, + "language_loss": 0.76801324, + "learning_rate": 1.6958280797710209e-06, + "loss": 0.79006994, + "num_input_tokens_seen": 100860940, + "step": 4675, + "time_per_iteration": 2.84261155128479 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.01079292, + "balance_loss_clip": 1.0219816, + "balance_loss_mlp": 1.00032806, + "epoch": 0.5622557566283893, + "flos": 61207046686080.0, + "grad_norm": 0.7130426812868604, + "language_loss": 0.54720217, + "learning_rate": 1.6950581927812198e-06, + "loss": 0.56900823, + "num_input_tokens_seen": 100920510, + "step": 4676, + "time_per_iteration": 3.2183725833892822 + }, + { + "auxiliary_loss_clip": 0.01129002, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_clip": 1.02638435, + "balance_loss_mlp": 1.00474596, + "epoch": 0.5623759995190284, + "flos": 26468534505600.0, + "grad_norm": 1.9861796432881025, + "language_loss": 0.78958917, + "learning_rate": 1.6942883520508486e-06, + "loss": 0.81173062, + "num_input_tokens_seen": 100939245, + "step": 4677, + "time_per_iteration": 2.704620361328125 + }, + { + "auxiliary_loss_clip": 0.0111503, + "auxiliary_loss_mlp": 0.01085834, + "balance_loss_clip": 1.02942443, + "balance_loss_mlp": 1.00543952, + "epoch": 0.5624962424096676, + "flos": 19390900798080.0, + "grad_norm": 4.102528114712967, + "language_loss": 0.76953262, + "learning_rate": 1.693518557696691e-06, + "loss": 0.79154128, + "num_input_tokens_seen": 100958385, + "step": 4678, + "time_per_iteration": 2.670607328414917 + }, + { + "auxiliary_loss_clip": 0.01129185, + "auxiliary_loss_mlp": 0.01085266, + "balance_loss_clip": 1.02661824, + "balance_loss_mlp": 1.00487137, + "epoch": 0.5626164853003066, + "flos": 20667345482880.0, + "grad_norm": 2.145345592328796, + "language_loss": 0.89215487, + "learning_rate": 1.6927488098355252e-06, + "loss": 0.91429937, + "num_input_tokens_seen": 100976015, + "step": 4679, + "time_per_iteration": 2.7029659748077393 + }, + { + "auxiliary_loss_clip": 0.01068822, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_clip": 1.02135682, + "balance_loss_mlp": 1.00005138, + "epoch": 0.5627367281909457, + "flos": 62766071665920.0, + "grad_norm": 0.9081077956341698, + "language_loss": 0.63236505, + "learning_rate": 1.6919791085841201e-06, + "loss": 0.65384346, + "num_input_tokens_seen": 101033425, + "step": 4680, + "time_per_iteration": 3.303849458694458 + }, + { + "auxiliary_loss_clip": 0.01129231, + "auxiliary_loss_mlp": 0.0108524, + "balance_loss_clip": 1.02621078, + "balance_loss_mlp": 1.00470197, + "epoch": 0.5628569710815848, + "flos": 12787144243200.0, + "grad_norm": 2.475080407096828, + "language_loss": 0.78972816, + "learning_rate": 1.6912094540592396e-06, + "loss": 0.81187284, + "num_input_tokens_seen": 101048945, + "step": 4681, + "time_per_iteration": 2.6843361854553223 + }, + { + "auxiliary_loss_clip": 0.01127024, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_clip": 1.026142, + "balance_loss_mlp": 1.00383019, + "epoch": 0.5629772139722239, + "flos": 13762082165760.0, + "grad_norm": 2.610283157467322, + "language_loss": 0.81654781, + "learning_rate": 1.6904398463776393e-06, + "loss": 0.83866119, + "num_input_tokens_seen": 101062745, + "step": 4682, + "time_per_iteration": 2.6302287578582764 + }, + { + "auxiliary_loss_clip": 0.01130503, + "auxiliary_loss_mlp": 0.0108379, + "balance_loss_clip": 1.02734959, + "balance_loss_mlp": 1.00344324, + "epoch": 0.5630974568628629, + "flos": 21467830026240.0, + "grad_norm": 2.108796190916883, + "language_loss": 0.72759819, + "learning_rate": 1.6896702856560683e-06, + "loss": 0.7497412, + "num_input_tokens_seen": 101081840, + "step": 4683, + "time_per_iteration": 2.743098497390747 + }, + { + "auxiliary_loss_clip": 0.01113455, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_clip": 1.02666712, + "balance_loss_mlp": 1.00521183, + "epoch": 0.5632176997535021, + "flos": 14245907385600.0, + "grad_norm": 2.7431686898590977, + "language_loss": 0.69456422, + "learning_rate": 1.6889007720112677e-06, + "loss": 0.71655488, + "num_input_tokens_seen": 101099585, + "step": 4684, + "time_per_iteration": 2.728224039077759 + }, + { + "auxiliary_loss_clip": 0.0112956, + "auxiliary_loss_mlp": 0.01084671, + "balance_loss_clip": 1.02691174, + "balance_loss_mlp": 1.00427628, + "epoch": 0.5633379426441412, + "flos": 20812244947200.0, + "grad_norm": 1.579845736008536, + "language_loss": 0.77308476, + "learning_rate": 1.6881313055599734e-06, + "loss": 0.79522705, + "num_input_tokens_seen": 101119515, + "step": 4685, + "time_per_iteration": 2.68013596534729 + }, + { + "auxiliary_loss_clip": 0.01122089, + "auxiliary_loss_mlp": 0.01085238, + "balance_loss_clip": 1.02692127, + "balance_loss_mlp": 1.00470042, + "epoch": 0.5634581855347802, + "flos": 22600883617920.0, + "grad_norm": 2.132514683008261, + "language_loss": 0.82317638, + "learning_rate": 1.6873618864189117e-06, + "loss": 0.84524965, + "num_input_tokens_seen": 101135285, + "step": 4686, + "time_per_iteration": 2.791358709335327 + }, + { + "auxiliary_loss_clip": 0.01129189, + "auxiliary_loss_mlp": 0.01085001, + "balance_loss_clip": 1.02721965, + "balance_loss_mlp": 1.00455809, + "epoch": 0.5635784284254194, + "flos": 21506972872320.0, + "grad_norm": 2.397585888120513, + "language_loss": 0.78488183, + "learning_rate": 1.686592514704803e-06, + "loss": 0.80702376, + "num_input_tokens_seen": 101152680, + "step": 4687, + "time_per_iteration": 2.7346038818359375 + }, + { + "auxiliary_loss_clip": 0.01120409, + "auxiliary_loss_mlp": 0.01085301, + "balance_loss_clip": 1.02722692, + "balance_loss_mlp": 1.00500166, + "epoch": 0.5636986713160584, + "flos": 19827466698240.0, + "grad_norm": 2.618230256109424, + "language_loss": 0.70608526, + "learning_rate": 1.685823190534361e-06, + "loss": 0.72814238, + "num_input_tokens_seen": 101170920, + "step": 4688, + "time_per_iteration": 2.7069075107574463 + }, + { + "auxiliary_loss_clip": 0.01139677, + "auxiliary_loss_mlp": 0.01086009, + "balance_loss_clip": 1.02801752, + "balance_loss_mlp": 1.00547123, + "epoch": 0.5638189142066975, + "flos": 19792453916160.0, + "grad_norm": 3.4851181084390377, + "language_loss": 0.8361941, + "learning_rate": 1.6850539140242907e-06, + "loss": 0.85845101, + "num_input_tokens_seen": 101190180, + "step": 4689, + "time_per_iteration": 2.6869494915008545 + }, + { + "auxiliary_loss_clip": 0.01113951, + "auxiliary_loss_mlp": 0.01084722, + "balance_loss_clip": 1.02739811, + "balance_loss_mlp": 1.00427926, + "epoch": 0.5639391570973367, + "flos": 22893771116160.0, + "grad_norm": 1.689161788979471, + "language_loss": 0.823327, + "learning_rate": 1.684284685291292e-06, + "loss": 0.84531379, + "num_input_tokens_seen": 101211825, + "step": 4690, + "time_per_iteration": 2.7112278938293457 + }, + { + "auxiliary_loss_clip": 0.01138257, + "auxiliary_loss_mlp": 0.01084921, + "balance_loss_clip": 1.02744222, + "balance_loss_mlp": 1.00447893, + "epoch": 0.5640593999879757, + "flos": 23727077712000.0, + "grad_norm": 1.911010118732395, + "language_loss": 0.81580216, + "learning_rate": 1.683515504452055e-06, + "loss": 0.83803397, + "num_input_tokens_seen": 101229200, + "step": 4691, + "time_per_iteration": 2.6313018798828125 + }, + { + "auxiliary_loss_clip": 0.01103745, + "auxiliary_loss_mlp": 0.01084252, + "balance_loss_clip": 1.02408123, + "balance_loss_mlp": 1.00371408, + "epoch": 0.5641796428786148, + "flos": 22710123855360.0, + "grad_norm": 1.789726194157115, + "language_loss": 0.66538018, + "learning_rate": 1.6827463716232648e-06, + "loss": 0.68726015, + "num_input_tokens_seen": 101249860, + "step": 4692, + "time_per_iteration": 3.6823434829711914 + }, + { + "auxiliary_loss_clip": 0.01130714, + "auxiliary_loss_mlp": 0.00872957, + "balance_loss_clip": 1.02767062, + "balance_loss_mlp": 1.00012648, + "epoch": 0.5642998857692539, + "flos": 19791987039360.0, + "grad_norm": 1.6688857685290477, + "language_loss": 0.75589257, + "learning_rate": 1.6819772869215972e-06, + "loss": 0.77592933, + "num_input_tokens_seen": 101268940, + "step": 4693, + "time_per_iteration": 3.631471872329712 + }, + { + "auxiliary_loss_clip": 0.01122306, + "auxiliary_loss_mlp": 0.01085258, + "balance_loss_clip": 1.02771533, + "balance_loss_mlp": 1.00495875, + "epoch": 0.564420128659893, + "flos": 23185904428800.0, + "grad_norm": 1.7015435357137245, + "language_loss": 0.82494318, + "learning_rate": 1.6812082504637228e-06, + "loss": 0.84701884, + "num_input_tokens_seen": 101290260, + "step": 4694, + "time_per_iteration": 2.710496664047241 + }, + { + "auxiliary_loss_clip": 0.01129038, + "auxiliary_loss_mlp": 0.0108441, + "balance_loss_clip": 1.02794766, + "balance_loss_mlp": 1.00406265, + "epoch": 0.564540371550532, + "flos": 23258264376960.0, + "grad_norm": 1.4183830443616547, + "language_loss": 0.74384892, + "learning_rate": 1.6804392623663025e-06, + "loss": 0.7659834, + "num_input_tokens_seen": 101311465, + "step": 4695, + "time_per_iteration": 3.6997458934783936 + }, + { + "auxiliary_loss_clip": 0.01130041, + "auxiliary_loss_mlp": 0.01084405, + "balance_loss_clip": 1.02871501, + "balance_loss_mlp": 1.00400996, + "epoch": 0.5646606144411712, + "flos": 25010058672000.0, + "grad_norm": 12.471679068162532, + "language_loss": 0.78511995, + "learning_rate": 1.6796703227459935e-06, + "loss": 0.80726445, + "num_input_tokens_seen": 101329420, + "step": 4696, + "time_per_iteration": 2.7186944484710693 + }, + { + "auxiliary_loss_clip": 0.01100223, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_clip": 1.02423608, + "balance_loss_mlp": 1.00341976, + "epoch": 0.5647808573318103, + "flos": 36539645806080.0, + "grad_norm": 2.7279430338890878, + "language_loss": 0.76120377, + "learning_rate": 1.6789014317194407e-06, + "loss": 0.78304362, + "num_input_tokens_seen": 101350900, + "step": 4697, + "time_per_iteration": 2.9751217365264893 + }, + { + "auxiliary_loss_clip": 0.01105306, + "auxiliary_loss_mlp": 0.01085562, + "balance_loss_clip": 1.02750111, + "balance_loss_mlp": 1.00502443, + "epoch": 0.5649011002224493, + "flos": 22528451842560.0, + "grad_norm": 3.0683965653007537, + "language_loss": 0.73001003, + "learning_rate": 1.6781325894032853e-06, + "loss": 0.75191867, + "num_input_tokens_seen": 101369860, + "step": 4698, + "time_per_iteration": 3.6848130226135254 + }, + { + "auxiliary_loss_clip": 0.01111971, + "auxiliary_loss_mlp": 0.01085553, + "balance_loss_clip": 1.02155149, + "balance_loss_mlp": 1.00515795, + "epoch": 0.5650213431130885, + "flos": 18515147304960.0, + "grad_norm": 1.9560002210345506, + "language_loss": 0.91957939, + "learning_rate": 1.6773637959141608e-06, + "loss": 0.94155461, + "num_input_tokens_seen": 101386835, + "step": 4699, + "time_per_iteration": 2.708812952041626 + }, + { + "auxiliary_loss_clip": 0.01119581, + "auxiliary_loss_mlp": 0.01085189, + "balance_loss_clip": 1.02629352, + "balance_loss_mlp": 1.00479376, + "epoch": 0.5651415860037275, + "flos": 17526310819200.0, + "grad_norm": 2.6258937389749013, + "language_loss": 0.66483569, + "learning_rate": 1.6765950513686915e-06, + "loss": 0.68688333, + "num_input_tokens_seen": 101404945, + "step": 4700, + "time_per_iteration": 2.711940050125122 + }, + { + "auxiliary_loss_clip": 0.01086651, + "auxiliary_loss_mlp": 0.01085618, + "balance_loss_clip": 1.02563584, + "balance_loss_mlp": 1.00512803, + "epoch": 0.5652618288943666, + "flos": 25520026014720.0, + "grad_norm": 1.608398808862499, + "language_loss": 0.76070851, + "learning_rate": 1.675826355883496e-06, + "loss": 0.78243124, + "num_input_tokens_seen": 101424160, + "step": 4701, + "time_per_iteration": 2.8204431533813477 + }, + { + "auxiliary_loss_clip": 0.01117893, + "auxiliary_loss_mlp": 0.0108499, + "balance_loss_clip": 1.02602863, + "balance_loss_mlp": 1.00464308, + "epoch": 0.5653820717850057, + "flos": 19683105937920.0, + "grad_norm": 1.8604302127030488, + "language_loss": 0.79665464, + "learning_rate": 1.6750577095751848e-06, + "loss": 0.81868345, + "num_input_tokens_seen": 101443270, + "step": 4702, + "time_per_iteration": 2.7155017852783203 + }, + { + "auxiliary_loss_clip": 0.01139006, + "auxiliary_loss_mlp": 0.01084839, + "balance_loss_clip": 1.02830291, + "balance_loss_mlp": 1.00458694, + "epoch": 0.5655023146756448, + "flos": 26979722910720.0, + "grad_norm": 1.9145539116848376, + "language_loss": 0.7292012, + "learning_rate": 1.6742891125603605e-06, + "loss": 0.75143963, + "num_input_tokens_seen": 101464175, + "step": 4703, + "time_per_iteration": 2.6976263523101807 + }, + { + "auxiliary_loss_clip": 0.01129319, + "auxiliary_loss_mlp": 0.01085995, + "balance_loss_clip": 1.02800083, + "balance_loss_mlp": 1.00550461, + "epoch": 0.5656225575662839, + "flos": 27669351104640.0, + "grad_norm": 1.883420323776326, + "language_loss": 0.72004229, + "learning_rate": 1.6735205649556185e-06, + "loss": 0.74219543, + "num_input_tokens_seen": 101484045, + "step": 4704, + "time_per_iteration": 2.721463441848755 + }, + { + "auxiliary_loss_clip": 0.01095653, + "auxiliary_loss_mlp": 0.01085502, + "balance_loss_clip": 1.02681446, + "balance_loss_mlp": 1.00510693, + "epoch": 0.5657428004569229, + "flos": 24349732997760.0, + "grad_norm": 1.7340426662031498, + "language_loss": 0.84896678, + "learning_rate": 1.6727520668775476e-06, + "loss": 0.87077832, + "num_input_tokens_seen": 101504330, + "step": 4705, + "time_per_iteration": 2.78488826751709 + }, + { + "auxiliary_loss_clip": 0.01138399, + "auxiliary_loss_mlp": 0.01084872, + "balance_loss_clip": 1.02693117, + "balance_loss_mlp": 1.00442922, + "epoch": 0.5658630433475621, + "flos": 21944041562880.0, + "grad_norm": 1.4903442868314671, + "language_loss": 0.75305283, + "learning_rate": 1.6719836184427275e-06, + "loss": 0.77528548, + "num_input_tokens_seen": 101524635, + "step": 4706, + "time_per_iteration": 2.7280149459838867 + }, + { + "auxiliary_loss_clip": 0.01118258, + "auxiliary_loss_mlp": 0.01084548, + "balance_loss_clip": 1.02542675, + "balance_loss_mlp": 1.00415349, + "epoch": 0.5659832862382012, + "flos": 30409012218240.0, + "grad_norm": 1.9993540899620543, + "language_loss": 0.6471616, + "learning_rate": 1.671215219767733e-06, + "loss": 0.66918969, + "num_input_tokens_seen": 101544095, + "step": 4707, + "time_per_iteration": 2.8833234310150146 + }, + { + "auxiliary_loss_clip": 0.01104639, + "auxiliary_loss_mlp": 0.01085647, + "balance_loss_clip": 1.02598405, + "balance_loss_mlp": 1.00515723, + "epoch": 0.5661035291288402, + "flos": 13188194570880.0, + "grad_norm": 1.9802394611661946, + "language_loss": 0.76269889, + "learning_rate": 1.670446870969127e-06, + "loss": 0.78460181, + "num_input_tokens_seen": 101561760, + "step": 4708, + "time_per_iteration": 2.851649284362793 + }, + { + "auxiliary_loss_clip": 0.0110257, + "auxiliary_loss_mlp": 0.01084753, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.00435817, + "epoch": 0.5662237720194794, + "flos": 16143032108160.0, + "grad_norm": 2.011240384713942, + "language_loss": 0.79707748, + "learning_rate": 1.6696785721634685e-06, + "loss": 0.81895071, + "num_input_tokens_seen": 101576245, + "step": 4709, + "time_per_iteration": 2.803295612335205 + }, + { + "auxiliary_loss_clip": 0.011313, + "auxiliary_loss_mlp": 0.01084324, + "balance_loss_clip": 1.02806425, + "balance_loss_mlp": 1.0039289, + "epoch": 0.5663440149101184, + "flos": 17676848718720.0, + "grad_norm": 1.6971834984281151, + "language_loss": 0.73321998, + "learning_rate": 1.6689103234673086e-06, + "loss": 0.75537622, + "num_input_tokens_seen": 101594565, + "step": 4710, + "time_per_iteration": 2.757575273513794 + }, + { + "auxiliary_loss_clip": 0.01118579, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_clip": 1.02574742, + "balance_loss_mlp": 1.00518167, + "epoch": 0.5664642578007575, + "flos": 23368330627200.0, + "grad_norm": 1.7886448195442488, + "language_loss": 0.77225959, + "learning_rate": 1.668142124997189e-06, + "loss": 0.79430163, + "num_input_tokens_seen": 101614225, + "step": 4711, + "time_per_iteration": 2.7338085174560547 + }, + { + "auxiliary_loss_clip": 0.01096451, + "auxiliary_loss_mlp": 0.01079438, + "balance_loss_clip": 1.02681339, + "balance_loss_mlp": 1.00047386, + "epoch": 0.5665845006913967, + "flos": 65516470945920.0, + "grad_norm": 0.7257105017580725, + "language_loss": 0.59803182, + "learning_rate": 1.6673739768696453e-06, + "loss": 0.61979073, + "num_input_tokens_seen": 101680795, + "step": 4712, + "time_per_iteration": 3.2973437309265137 + }, + { + "auxiliary_loss_clip": 0.0112038, + "auxiliary_loss_mlp": 0.01085693, + "balance_loss_clip": 1.02573931, + "balance_loss_mlp": 1.00525117, + "epoch": 0.5667047435820357, + "flos": 26140885620480.0, + "grad_norm": 1.659853955063984, + "language_loss": 0.77309525, + "learning_rate": 1.6666058792012052e-06, + "loss": 0.795156, + "num_input_tokens_seen": 101701680, + "step": 4713, + "time_per_iteration": 2.7434728145599365 + }, + { + "auxiliary_loss_clip": 0.01110245, + "auxiliary_loss_mlp": 0.01079224, + "balance_loss_clip": 1.02186489, + "balance_loss_mlp": 1.00026023, + "epoch": 0.5668249864726748, + "flos": 71866949725440.0, + "grad_norm": 0.876153527472572, + "language_loss": 0.68824553, + "learning_rate": 1.6658378321083878e-06, + "loss": 0.71014023, + "num_input_tokens_seen": 101766010, + "step": 4714, + "time_per_iteration": 3.32908034324646 + }, + { + "auxiliary_loss_clip": 0.01071064, + "auxiliary_loss_mlp": 0.01084975, + "balance_loss_clip": 1.02478814, + "balance_loss_mlp": 1.00462794, + "epoch": 0.5669452293633139, + "flos": 22195667312640.0, + "grad_norm": 1.6558067885488639, + "language_loss": 0.82457548, + "learning_rate": 1.6650698357077055e-06, + "loss": 0.84613585, + "num_input_tokens_seen": 101783055, + "step": 4715, + "time_per_iteration": 2.7862792015075684 + }, + { + "auxiliary_loss_clip": 0.01120584, + "auxiliary_loss_mlp": 0.01086604, + "balance_loss_clip": 1.02623546, + "balance_loss_mlp": 1.00611377, + "epoch": 0.567065472253953, + "flos": 18223193560320.0, + "grad_norm": 2.4999499731665695, + "language_loss": 0.80827951, + "learning_rate": 1.6643018901156632e-06, + "loss": 0.83035147, + "num_input_tokens_seen": 101802150, + "step": 4716, + "time_per_iteration": 2.714439868927002 + }, + { + "auxiliary_loss_clip": 0.01120316, + "auxiliary_loss_mlp": 0.01084822, + "balance_loss_clip": 1.02573538, + "balance_loss_mlp": 1.00437927, + "epoch": 0.567185715144592, + "flos": 20371548983040.0, + "grad_norm": 5.026823471540632, + "language_loss": 0.79222417, + "learning_rate": 1.6635339954487566e-06, + "loss": 0.81427556, + "num_input_tokens_seen": 101818025, + "step": 4717, + "time_per_iteration": 2.688342809677124 + }, + { + "auxiliary_loss_clip": 0.01119951, + "auxiliary_loss_mlp": 0.01084902, + "balance_loss_clip": 1.02593064, + "balance_loss_mlp": 1.00450706, + "epoch": 0.5673059580352312, + "flos": 23221348174080.0, + "grad_norm": 2.559784328736366, + "language_loss": 0.8200289, + "learning_rate": 1.6627661518234765e-06, + "loss": 0.84207743, + "num_input_tokens_seen": 101837280, + "step": 4718, + "time_per_iteration": 4.558771371841431 + }, + { + "auxiliary_loss_clip": 0.0109543, + "auxiliary_loss_mlp": 0.01085257, + "balance_loss_clip": 1.02104974, + "balance_loss_mlp": 1.00491023, + "epoch": 0.5674262009258703, + "flos": 21719599430400.0, + "grad_norm": 1.6489642878421091, + "language_loss": 0.85361791, + "learning_rate": 1.661998359356302e-06, + "loss": 0.8754248, + "num_input_tokens_seen": 101856310, + "step": 4719, + "time_per_iteration": 2.8295063972473145 + }, + { + "auxiliary_loss_clip": 0.01118046, + "auxiliary_loss_mlp": 0.01079484, + "balance_loss_clip": 1.02181005, + "balance_loss_mlp": 1.00013876, + "epoch": 0.5675464438165093, + "flos": 67470369114240.0, + "grad_norm": 0.7434002141631179, + "language_loss": 0.55871671, + "learning_rate": 1.6612306181637077e-06, + "loss": 0.58069205, + "num_input_tokens_seen": 101915635, + "step": 4720, + "time_per_iteration": 3.18971586227417 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01085522, + "balance_loss_clip": 1.02431035, + "balance_loss_mlp": 1.00507927, + "epoch": 0.5676666867071485, + "flos": 18879173688960.0, + "grad_norm": 1.941534782654811, + "language_loss": 0.65397334, + "learning_rate": 1.6604629283621598e-06, + "loss": 0.67592454, + "num_input_tokens_seen": 101933565, + "step": 4721, + "time_per_iteration": 3.6413447856903076 + }, + { + "auxiliary_loss_clip": 0.0113888, + "auxiliary_loss_mlp": 0.01086242, + "balance_loss_clip": 1.02764893, + "balance_loss_mlp": 1.00575209, + "epoch": 0.5677869295977875, + "flos": 33546778744320.0, + "grad_norm": 2.534888221107641, + "language_loss": 0.7447561, + "learning_rate": 1.6596952900681152e-06, + "loss": 0.76700735, + "num_input_tokens_seen": 101954325, + "step": 4722, + "time_per_iteration": 2.706533432006836 + }, + { + "auxiliary_loss_clip": 0.01095711, + "auxiliary_loss_mlp": 0.01086219, + "balance_loss_clip": 1.02152729, + "balance_loss_mlp": 1.00568128, + "epoch": 0.5679071724884266, + "flos": 28037256157440.0, + "grad_norm": 10.477540410093608, + "language_loss": 0.81718433, + "learning_rate": 1.658927703398025e-06, + "loss": 0.83900368, + "num_input_tokens_seen": 101974390, + "step": 4723, + "time_per_iteration": 2.82674503326416 + }, + { + "auxiliary_loss_clip": 0.01101227, + "auxiliary_loss_mlp": 0.01084918, + "balance_loss_clip": 1.02415907, + "balance_loss_mlp": 1.00452304, + "epoch": 0.5680274153790658, + "flos": 23550110380800.0, + "grad_norm": 2.9850333191524574, + "language_loss": 0.77733111, + "learning_rate": 1.6581601684683309e-06, + "loss": 0.79919261, + "num_input_tokens_seen": 101994815, + "step": 4724, + "time_per_iteration": 3.7610321044921875 + }, + { + "auxiliary_loss_clip": 0.0112766, + "auxiliary_loss_mlp": 0.01084871, + "balance_loss_clip": 1.02648461, + "balance_loss_mlp": 1.00447679, + "epoch": 0.5681476582697048, + "flos": 22455158140800.0, + "grad_norm": 3.060784586867934, + "language_loss": 0.68493241, + "learning_rate": 1.6573926853954674e-06, + "loss": 0.70705777, + "num_input_tokens_seen": 102012400, + "step": 4725, + "time_per_iteration": 2.7085282802581787 + }, + { + "auxiliary_loss_clip": 0.01122849, + "auxiliary_loss_mlp": 0.01084854, + "balance_loss_clip": 1.02822566, + "balance_loss_mlp": 1.00460291, + "epoch": 0.5682679011603439, + "flos": 19536913584000.0, + "grad_norm": 1.7886522853660727, + "language_loss": 0.83108675, + "learning_rate": 1.6566252542958608e-06, + "loss": 0.85316384, + "num_input_tokens_seen": 102031900, + "step": 4726, + "time_per_iteration": 2.6947977542877197 + }, + { + "auxiliary_loss_clip": 0.01108835, + "auxiliary_loss_mlp": 0.01085032, + "balance_loss_clip": 1.02510047, + "balance_loss_mlp": 1.0045892, + "epoch": 0.568388144050983, + "flos": 28765488493440.0, + "grad_norm": 2.086698502424485, + "language_loss": 0.78402466, + "learning_rate": 1.6558578752859305e-06, + "loss": 0.80596328, + "num_input_tokens_seen": 102050860, + "step": 4727, + "time_per_iteration": 2.848898410797119 + }, + { + "auxiliary_loss_clip": 0.01112632, + "auxiliary_loss_mlp": 0.01084247, + "balance_loss_clip": 1.02646899, + "balance_loss_mlp": 1.00404286, + "epoch": 0.5685083869416221, + "flos": 21209452519680.0, + "grad_norm": 1.7279236673817169, + "language_loss": 0.78569543, + "learning_rate": 1.6550905484820865e-06, + "loss": 0.80766416, + "num_input_tokens_seen": 102069320, + "step": 4728, + "time_per_iteration": 2.864713668823242 + }, + { + "auxiliary_loss_clip": 0.01138107, + "auxiliary_loss_mlp": 0.01084849, + "balance_loss_clip": 1.02704406, + "balance_loss_mlp": 1.00435925, + "epoch": 0.5686286298322611, + "flos": 24827021942400.0, + "grad_norm": 2.0795615158289213, + "language_loss": 0.78698111, + "learning_rate": 1.6543232740007328e-06, + "loss": 0.80921066, + "num_input_tokens_seen": 102086435, + "step": 4729, + "time_per_iteration": 2.8825600147247314 + }, + { + "auxiliary_loss_clip": 0.01130168, + "auxiliary_loss_mlp": 0.01085498, + "balance_loss_clip": 1.02791119, + "balance_loss_mlp": 1.00505567, + "epoch": 0.5687488727229003, + "flos": 26615121909120.0, + "grad_norm": 2.966897396112478, + "language_loss": 0.67109746, + "learning_rate": 1.653556051958263e-06, + "loss": 0.69325411, + "num_input_tokens_seen": 102106115, + "step": 4730, + "time_per_iteration": 2.750131368637085 + }, + { + "auxiliary_loss_clip": 0.0108944, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_clip": 1.02371407, + "balance_loss_mlp": 1.00379193, + "epoch": 0.5688691156135394, + "flos": 20808725414400.0, + "grad_norm": 1.855533882777244, + "language_loss": 0.73785806, + "learning_rate": 1.6527888824710642e-06, + "loss": 0.75959384, + "num_input_tokens_seen": 102125715, + "step": 4731, + "time_per_iteration": 2.8413703441619873 + }, + { + "auxiliary_loss_clip": 0.01110968, + "auxiliary_loss_mlp": 0.0108592, + "balance_loss_clip": 1.02485251, + "balance_loss_mlp": 1.00547743, + "epoch": 0.5689893585041784, + "flos": 25880963829120.0, + "grad_norm": 2.750235240766321, + "language_loss": 0.76736641, + "learning_rate": 1.6520217656555166e-06, + "loss": 0.78933525, + "num_input_tokens_seen": 102145005, + "step": 4732, + "time_per_iteration": 2.8020293712615967 + }, + { + "auxiliary_loss_clip": 0.01121038, + "auxiliary_loss_mlp": 0.01085657, + "balance_loss_clip": 1.02629995, + "balance_loss_mlp": 1.00526214, + "epoch": 0.5691096013948175, + "flos": 23477463123840.0, + "grad_norm": 1.4531641212014412, + "language_loss": 0.70960462, + "learning_rate": 1.65125470162799e-06, + "loss": 0.73167151, + "num_input_tokens_seen": 102165360, + "step": 4733, + "time_per_iteration": 2.7838246822357178 + }, + { + "auxiliary_loss_clip": 0.01108746, + "auxiliary_loss_mlp": 0.0108481, + "balance_loss_clip": 1.02437294, + "balance_loss_mlp": 1.00436783, + "epoch": 0.5692298442854566, + "flos": 18075600576000.0, + "grad_norm": 2.2115923537797784, + "language_loss": 0.69894218, + "learning_rate": 1.6504876905048485e-06, + "loss": 0.72087777, + "num_input_tokens_seen": 102182320, + "step": 4734, + "time_per_iteration": 2.7599282264709473 + }, + { + "auxiliary_loss_clip": 0.01138606, + "auxiliary_loss_mlp": 0.01086384, + "balance_loss_clip": 1.02753925, + "balance_loss_mlp": 1.00598907, + "epoch": 0.5693500871760957, + "flos": 23039317025280.0, + "grad_norm": 1.5514495389581444, + "language_loss": 0.71908367, + "learning_rate": 1.6497207324024464e-06, + "loss": 0.7413336, + "num_input_tokens_seen": 102201220, + "step": 4735, + "time_per_iteration": 2.633151054382324 + }, + { + "auxiliary_loss_clip": 0.01103919, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_clip": 1.02529335, + "balance_loss_mlp": 1.00642109, + "epoch": 0.5694703300667348, + "flos": 18989670902400.0, + "grad_norm": 1.8029619982648228, + "language_loss": 0.82622445, + "learning_rate": 1.6489538274371305e-06, + "loss": 0.84813368, + "num_input_tokens_seen": 102219825, + "step": 4736, + "time_per_iteration": 2.779789686203003 + }, + { + "auxiliary_loss_clip": 0.01128324, + "auxiliary_loss_mlp": 0.01084637, + "balance_loss_clip": 1.02688217, + "balance_loss_mlp": 1.00428963, + "epoch": 0.5695905729573739, + "flos": 21908705558400.0, + "grad_norm": 2.16985820953656, + "language_loss": 0.82905281, + "learning_rate": 1.6481869757252396e-06, + "loss": 0.8511824, + "num_input_tokens_seen": 102238160, + "step": 4737, + "time_per_iteration": 2.709388017654419 + }, + { + "auxiliary_loss_clip": 0.01129784, + "auxiliary_loss_mlp": 0.01084853, + "balance_loss_clip": 1.02774596, + "balance_loss_mlp": 1.00450611, + "epoch": 0.569710815848013, + "flos": 28476659232000.0, + "grad_norm": 1.5036071009744314, + "language_loss": 0.7185179, + "learning_rate": 1.647420177383105e-06, + "loss": 0.7406643, + "num_input_tokens_seen": 102261030, + "step": 4738, + "time_per_iteration": 2.7835938930511475 + }, + { + "auxiliary_loss_clip": 0.01127246, + "auxiliary_loss_mlp": 0.01083459, + "balance_loss_clip": 1.02651763, + "balance_loss_mlp": 1.00311184, + "epoch": 0.569831058738652, + "flos": 28366162018560.0, + "grad_norm": 2.143841547512868, + "language_loss": 0.72827637, + "learning_rate": 1.646653432527049e-06, + "loss": 0.75038338, + "num_input_tokens_seen": 102281670, + "step": 4739, + "time_per_iteration": 2.7553648948669434 + }, + { + "auxiliary_loss_clip": 0.01109255, + "auxiliary_loss_mlp": 0.01085308, + "balance_loss_clip": 1.02473855, + "balance_loss_mlp": 1.00505614, + "epoch": 0.5699513016292912, + "flos": 25849973370240.0, + "grad_norm": 1.5990960277647517, + "language_loss": 0.74390507, + "learning_rate": 1.645886741273387e-06, + "loss": 0.76585072, + "num_input_tokens_seen": 102303485, + "step": 4740, + "time_per_iteration": 2.7758796215057373 + }, + { + "auxiliary_loss_clip": 0.01096519, + "auxiliary_loss_mlp": 0.01085192, + "balance_loss_clip": 1.02091312, + "balance_loss_mlp": 1.0048449, + "epoch": 0.5700715445199303, + "flos": 18037858360320.0, + "grad_norm": 1.9751196042864363, + "language_loss": 0.73831761, + "learning_rate": 1.645120103738424e-06, + "loss": 0.7601347, + "num_input_tokens_seen": 102320995, + "step": 4741, + "time_per_iteration": 2.8246726989746094 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.00873008, + "balance_loss_clip": 1.02778125, + "balance_loss_mlp": 1.000144, + "epoch": 0.5701917874105693, + "flos": 11473352392320.0, + "grad_norm": 3.8182132915874853, + "language_loss": 0.84310818, + "learning_rate": 1.6443535200384591e-06, + "loss": 0.86313701, + "num_input_tokens_seen": 102339170, + "step": 4742, + "time_per_iteration": 2.683846950531006 + }, + { + "auxiliary_loss_clip": 0.01137902, + "auxiliary_loss_mlp": 0.01084777, + "balance_loss_clip": 1.02727365, + "balance_loss_mlp": 1.00452507, + "epoch": 0.5703120303012085, + "flos": 21761759018880.0, + "grad_norm": 1.8093087580234755, + "language_loss": 0.70708394, + "learning_rate": 1.6435869902897827e-06, + "loss": 0.72931075, + "num_input_tokens_seen": 102357750, + "step": 4743, + "time_per_iteration": 3.5755045413970947 + }, + { + "auxiliary_loss_clip": 0.01095448, + "auxiliary_loss_mlp": 0.01079424, + "balance_loss_clip": 1.02374554, + "balance_loss_mlp": 1.00007856, + "epoch": 0.5704322731918475, + "flos": 56746258513920.0, + "grad_norm": 0.7895494337447531, + "language_loss": 0.62062258, + "learning_rate": 1.6428205146086764e-06, + "loss": 0.6423713, + "num_input_tokens_seen": 102419730, + "step": 4744, + "time_per_iteration": 4.328706741333008 + }, + { + "auxiliary_loss_clip": 0.01122203, + "auxiliary_loss_mlp": 0.01086159, + "balance_loss_clip": 1.02761984, + "balance_loss_mlp": 1.00566876, + "epoch": 0.5705525160824866, + "flos": 20741141975040.0, + "grad_norm": 1.5597047789996095, + "language_loss": 0.70681715, + "learning_rate": 1.6420540931114142e-06, + "loss": 0.72890067, + "num_input_tokens_seen": 102440320, + "step": 4745, + "time_per_iteration": 2.699556827545166 + }, + { + "auxiliary_loss_clip": 0.0111853, + "auxiliary_loss_mlp": 0.01084661, + "balance_loss_clip": 1.02524221, + "balance_loss_mlp": 1.0042187, + "epoch": 0.5706727589731257, + "flos": 18771262254720.0, + "grad_norm": 1.7938909868001613, + "language_loss": 0.79347765, + "learning_rate": 1.6412877259142616e-06, + "loss": 0.81550956, + "num_input_tokens_seen": 102460240, + "step": 4746, + "time_per_iteration": 3.697289228439331 + }, + { + "auxiliary_loss_clip": 0.01119833, + "auxiliary_loss_mlp": 0.0108521, + "balance_loss_clip": 1.02622938, + "balance_loss_mlp": 1.0048151, + "epoch": 0.5707930018637648, + "flos": 27634733372160.0, + "grad_norm": 1.9835036592385664, + "language_loss": 0.73306692, + "learning_rate": 1.6405214131334757e-06, + "loss": 0.7551173, + "num_input_tokens_seen": 102478765, + "step": 4747, + "time_per_iteration": 2.8032047748565674 + }, + { + "auxiliary_loss_clip": 0.01098164, + "auxiliary_loss_mlp": 0.01084919, + "balance_loss_clip": 1.02453279, + "balance_loss_mlp": 1.00457239, + "epoch": 0.5709132447544039, + "flos": 27597673514880.0, + "grad_norm": 1.7764685184303466, + "language_loss": 0.79534507, + "learning_rate": 1.6397551548853052e-06, + "loss": 0.81717592, + "num_input_tokens_seen": 102496930, + "step": 4748, + "time_per_iteration": 2.8244264125823975 + }, + { + "auxiliary_loss_clip": 0.01116656, + "auxiliary_loss_mlp": 0.01085299, + "balance_loss_clip": 1.02488589, + "balance_loss_mlp": 1.00485659, + "epoch": 0.571033487645043, + "flos": 21686095019520.0, + "grad_norm": 3.917896016116001, + "language_loss": 0.70703673, + "learning_rate": 1.6389889512859917e-06, + "loss": 0.72905624, + "num_input_tokens_seen": 102516590, + "step": 4749, + "time_per_iteration": 2.780320882797241 + }, + { + "auxiliary_loss_clip": 0.01102412, + "auxiliary_loss_mlp": 0.01079332, + "balance_loss_clip": 1.02231884, + "balance_loss_mlp": 1.00036812, + "epoch": 0.5711537305356821, + "flos": 70181445980160.0, + "grad_norm": 0.809503930791356, + "language_loss": 0.6034925, + "learning_rate": 1.638222802451767e-06, + "loss": 0.62530994, + "num_input_tokens_seen": 102578070, + "step": 4750, + "time_per_iteration": 4.251510143280029 + }, + { + "auxiliary_loss_clip": 0.0112814, + "auxiliary_loss_mlp": 0.01084009, + "balance_loss_clip": 1.02647614, + "balance_loss_mlp": 1.00375736, + "epoch": 0.5712739734263211, + "flos": 24717494396160.0, + "grad_norm": 1.5971073748845945, + "language_loss": 0.75196868, + "learning_rate": 1.6374567084988561e-06, + "loss": 0.77409017, + "num_input_tokens_seen": 102599255, + "step": 4751, + "time_per_iteration": 2.7598822116851807 + }, + { + "auxiliary_loss_clip": 0.01104946, + "auxiliary_loss_mlp": 0.01085008, + "balance_loss_clip": 1.02826667, + "balance_loss_mlp": 1.00456583, + "epoch": 0.5713942163169603, + "flos": 26578169792640.0, + "grad_norm": 1.7567486040511708, + "language_loss": 0.76450133, + "learning_rate": 1.6366906695434738e-06, + "loss": 0.78640091, + "num_input_tokens_seen": 102621775, + "step": 4752, + "time_per_iteration": 2.8146235942840576 + }, + { + "auxiliary_loss_clip": 0.01129173, + "auxiliary_loss_mlp": 0.01083873, + "balance_loss_clip": 1.02727747, + "balance_loss_mlp": 1.00366914, + "epoch": 0.5715144592075994, + "flos": 21142443697920.0, + "grad_norm": 1.7765699500506338, + "language_loss": 0.85615003, + "learning_rate": 1.6359246857018275e-06, + "loss": 0.87828052, + "num_input_tokens_seen": 102639305, + "step": 4753, + "time_per_iteration": 2.7047841548919678 + }, + { + "auxiliary_loss_clip": 0.01088627, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_clip": 1.02746356, + "balance_loss_mlp": 1.00400162, + "epoch": 0.5716347020982384, + "flos": 23330265189120.0, + "grad_norm": 1.8728072866532672, + "language_loss": 0.78378034, + "learning_rate": 1.6351587570901178e-06, + "loss": 0.80551004, + "num_input_tokens_seen": 102659430, + "step": 4754, + "time_per_iteration": 2.882568359375 + }, + { + "auxiliary_loss_clip": 0.01103874, + "auxiliary_loss_mlp": 0.01085823, + "balance_loss_clip": 1.02175081, + "balance_loss_mlp": 1.00542879, + "epoch": 0.5717549449888776, + "flos": 17009555806080.0, + "grad_norm": 4.085555322025223, + "language_loss": 0.75533915, + "learning_rate": 1.634392883824534e-06, + "loss": 0.7772361, + "num_input_tokens_seen": 102671430, + "step": 4755, + "time_per_iteration": 2.7143173217773438 + }, + { + "auxiliary_loss_clip": 0.01078984, + "auxiliary_loss_mlp": 0.0108521, + "balance_loss_clip": 1.02456665, + "balance_loss_mlp": 1.00467253, + "epoch": 0.5718751878795166, + "flos": 35518130922240.0, + "grad_norm": 1.6105782016677352, + "language_loss": 0.67837155, + "learning_rate": 1.6336270660212595e-06, + "loss": 0.70001346, + "num_input_tokens_seen": 102693025, + "step": 4756, + "time_per_iteration": 3.055727005004883 + }, + { + "auxiliary_loss_clip": 0.01107237, + "auxiliary_loss_mlp": 0.01084855, + "balance_loss_clip": 1.02231205, + "balance_loss_mlp": 1.0044601, + "epoch": 0.5719954307701557, + "flos": 38613989255040.0, + "grad_norm": 2.486479803882744, + "language_loss": 0.66108364, + "learning_rate": 1.6328613037964676e-06, + "loss": 0.68300456, + "num_input_tokens_seen": 102716090, + "step": 4757, + "time_per_iteration": 2.9015395641326904 + }, + { + "auxiliary_loss_clip": 0.01127464, + "auxiliary_loss_mlp": 0.01084495, + "balance_loss_clip": 1.02553964, + "balance_loss_mlp": 1.00410032, + "epoch": 0.5721156736607949, + "flos": 20631111638400.0, + "grad_norm": 1.589066959348946, + "language_loss": 0.67977798, + "learning_rate": 1.6320955972663241e-06, + "loss": 0.70189756, + "num_input_tokens_seen": 102735685, + "step": 4758, + "time_per_iteration": 2.6636545658111572 + }, + { + "auxiliary_loss_clip": 0.01127431, + "auxiliary_loss_mlp": 0.01084885, + "balance_loss_clip": 1.02536142, + "balance_loss_mlp": 1.0045383, + "epoch": 0.5722359165514339, + "flos": 37415076076800.0, + "grad_norm": 1.8065475816146133, + "language_loss": 0.65322846, + "learning_rate": 1.6313299465469857e-06, + "loss": 0.67535162, + "num_input_tokens_seen": 102758415, + "step": 4759, + "time_per_iteration": 2.9912831783294678 + }, + { + "auxiliary_loss_clip": 0.01130617, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_clip": 1.02786064, + "balance_loss_mlp": 1.00487947, + "epoch": 0.572356159442073, + "flos": 21972877205760.0, + "grad_norm": 2.6232316386140884, + "language_loss": 0.79382646, + "learning_rate": 1.6305643517546014e-06, + "loss": 0.8159858, + "num_input_tokens_seen": 102773795, + "step": 4760, + "time_per_iteration": 2.6729564666748047 + }, + { + "auxiliary_loss_clip": 0.0113828, + "auxiliary_loss_mlp": 0.01085058, + "balance_loss_clip": 1.02783036, + "balance_loss_mlp": 1.00471139, + "epoch": 0.5724764023327121, + "flos": 19135540033920.0, + "grad_norm": 2.3652334184516035, + "language_loss": 0.84827471, + "learning_rate": 1.629798813005311e-06, + "loss": 0.87050813, + "num_input_tokens_seen": 102793515, + "step": 4761, + "time_per_iteration": 2.6990771293640137 + }, + { + "auxiliary_loss_clip": 0.01098768, + "auxiliary_loss_mlp": 0.01084843, + "balance_loss_clip": 1.02466607, + "balance_loss_mlp": 1.00449634, + "epoch": 0.5725966452233512, + "flos": 22819759142400.0, + "grad_norm": 2.2256255685304334, + "language_loss": 0.70543438, + "learning_rate": 1.6290333304152473e-06, + "loss": 0.72727048, + "num_input_tokens_seen": 102813390, + "step": 4762, + "time_per_iteration": 2.850510597229004 + }, + { + "auxiliary_loss_clip": 0.01111844, + "auxiliary_loss_mlp": 0.01085772, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.00537765, + "epoch": 0.5727168881139902, + "flos": 41496610498560.0, + "grad_norm": 1.7363218172105406, + "language_loss": 0.56732094, + "learning_rate": 1.6282679041005314e-06, + "loss": 0.58929706, + "num_input_tokens_seen": 102838980, + "step": 4763, + "time_per_iteration": 2.8982481956481934 + }, + { + "auxiliary_loss_clip": 0.01119008, + "auxiliary_loss_mlp": 0.01084163, + "balance_loss_clip": 1.02583349, + "balance_loss_mlp": 1.00376809, + "epoch": 0.5728371310046293, + "flos": 14647675985280.0, + "grad_norm": 2.0145722277534914, + "language_loss": 0.86820471, + "learning_rate": 1.6275025341772789e-06, + "loss": 0.89023644, + "num_input_tokens_seen": 102855285, + "step": 4764, + "time_per_iteration": 2.6664297580718994 + }, + { + "auxiliary_loss_clip": 0.01120782, + "auxiliary_loss_mlp": 0.0108618, + "balance_loss_clip": 1.0265801, + "balance_loss_mlp": 1.00578523, + "epoch": 0.5729573738952685, + "flos": 21506613736320.0, + "grad_norm": 2.249956977737477, + "language_loss": 0.81955755, + "learning_rate": 1.626737220761596e-06, + "loss": 0.84162718, + "num_input_tokens_seen": 102872750, + "step": 4765, + "time_per_iteration": 2.700688600540161 + }, + { + "auxiliary_loss_clip": 0.01122306, + "auxiliary_loss_mlp": 0.01085463, + "balance_loss_clip": 1.02642167, + "balance_loss_mlp": 1.00497317, + "epoch": 0.5730776167859075, + "flos": 23621680229760.0, + "grad_norm": 2.0920124505917013, + "language_loss": 0.79373729, + "learning_rate": 1.62597196396958e-06, + "loss": 0.81581497, + "num_input_tokens_seen": 102890920, + "step": 4766, + "time_per_iteration": 2.664736747741699 + }, + { + "auxiliary_loss_clip": 0.01128305, + "auxiliary_loss_mlp": 0.01084699, + "balance_loss_clip": 1.02618551, + "balance_loss_mlp": 1.00425625, + "epoch": 0.5731978596765466, + "flos": 25739224761600.0, + "grad_norm": 1.8092633881099316, + "language_loss": 0.85651362, + "learning_rate": 1.6252067639173197e-06, + "loss": 0.87864363, + "num_input_tokens_seen": 102912830, + "step": 4767, + "time_per_iteration": 2.7463650703430176 + }, + { + "auxiliary_loss_clip": 0.01127704, + "auxiliary_loss_mlp": 0.01084254, + "balance_loss_clip": 1.02583551, + "balance_loss_mlp": 1.00390732, + "epoch": 0.5733181025671857, + "flos": 26359509749760.0, + "grad_norm": 1.7124905290630994, + "language_loss": 0.69835746, + "learning_rate": 1.6244416207208956e-06, + "loss": 0.72047698, + "num_input_tokens_seen": 102933765, + "step": 4768, + "time_per_iteration": 3.623972177505493 + }, + { + "auxiliary_loss_clip": 0.01102255, + "auxiliary_loss_mlp": 0.01085211, + "balance_loss_clip": 1.02332723, + "balance_loss_mlp": 1.00491142, + "epoch": 0.5734383454578248, + "flos": 29423874833280.0, + "grad_norm": 1.630448478053786, + "language_loss": 0.73383409, + "learning_rate": 1.6236765344963787e-06, + "loss": 0.75570869, + "num_input_tokens_seen": 102955025, + "step": 4769, + "time_per_iteration": 3.7712714672088623 + }, + { + "auxiliary_loss_clip": 0.01117242, + "auxiliary_loss_mlp": 0.01085043, + "balance_loss_clip": 1.02571893, + "balance_loss_mlp": 1.0047915, + "epoch": 0.5735585883484638, + "flos": 34969954487040.0, + "grad_norm": 3.303824472750592, + "language_loss": 0.69239867, + "learning_rate": 1.6229115053598322e-06, + "loss": 0.71442151, + "num_input_tokens_seen": 102976780, + "step": 4770, + "time_per_iteration": 2.8461005687713623 + }, + { + "auxiliary_loss_clip": 0.01129644, + "auxiliary_loss_mlp": 0.01084067, + "balance_loss_clip": 1.02859068, + "balance_loss_mlp": 1.00357735, + "epoch": 0.573678831239103, + "flos": 18770759464320.0, + "grad_norm": 1.822318758723773, + "language_loss": 0.71899629, + "learning_rate": 1.6221465334273108e-06, + "loss": 0.74113339, + "num_input_tokens_seen": 102995990, + "step": 4771, + "time_per_iteration": 2.712491750717163 + }, + { + "auxiliary_loss_clip": 0.01110286, + "auxiliary_loss_mlp": 0.01085691, + "balance_loss_clip": 1.02509081, + "balance_loss_mlp": 1.00524831, + "epoch": 0.5737990741297421, + "flos": 25702883176320.0, + "grad_norm": 2.084668252519496, + "language_loss": 0.61428702, + "learning_rate": 1.6213816188148593e-06, + "loss": 0.6362468, + "num_input_tokens_seen": 103014695, + "step": 4772, + "time_per_iteration": 3.8028578758239746 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01085354, + "balance_loss_clip": 1.02323079, + "balance_loss_mlp": 1.00500727, + "epoch": 0.5739193170203811, + "flos": 27269234530560.0, + "grad_norm": 1.9519491844895718, + "language_loss": 0.77199447, + "learning_rate": 1.6206167616385162e-06, + "loss": 0.79399318, + "num_input_tokens_seen": 103035760, + "step": 4773, + "time_per_iteration": 2.7389230728149414 + }, + { + "auxiliary_loss_clip": 0.01120698, + "auxiliary_loss_mlp": 0.01086301, + "balance_loss_clip": 1.02773058, + "balance_loss_mlp": 1.00581038, + "epoch": 0.5740395599110203, + "flos": 12239721993600.0, + "grad_norm": 2.076960981497223, + "language_loss": 0.73638105, + "learning_rate": 1.6198519620143078e-06, + "loss": 0.75845104, + "num_input_tokens_seen": 103052915, + "step": 4774, + "time_per_iteration": 2.759106159210205 + }, + { + "auxiliary_loss_clip": 0.01110175, + "auxiliary_loss_mlp": 0.01085867, + "balance_loss_clip": 1.02550197, + "balance_loss_mlp": 1.00556779, + "epoch": 0.5741598028016593, + "flos": 25921399564800.0, + "grad_norm": 1.653619672272277, + "language_loss": 0.78043497, + "learning_rate": 1.6190872200582546e-06, + "loss": 0.80239534, + "num_input_tokens_seen": 103074655, + "step": 4775, + "time_per_iteration": 3.728393316268921 + }, + { + "auxiliary_loss_clip": 0.01121179, + "auxiliary_loss_mlp": 0.00873049, + "balance_loss_clip": 1.02715123, + "balance_loss_mlp": 1.00013173, + "epoch": 0.5742800456922984, + "flos": 19244133826560.0, + "grad_norm": 2.0276646973021495, + "language_loss": 0.7793479, + "learning_rate": 1.6183225358863676e-06, + "loss": 0.79929018, + "num_input_tokens_seen": 103091550, + "step": 4776, + "time_per_iteration": 2.726079225540161 + }, + { + "auxiliary_loss_clip": 0.01111959, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_clip": 1.02400327, + "balance_loss_mlp": 1.00383806, + "epoch": 0.5744002885829376, + "flos": 30920487932160.0, + "grad_norm": 2.1452259739242483, + "language_loss": 0.72077674, + "learning_rate": 1.617557909614648e-06, + "loss": 0.74273914, + "num_input_tokens_seen": 103110985, + "step": 4777, + "time_per_iteration": 2.793196678161621 + }, + { + "auxiliary_loss_clip": 0.0110719, + "auxiliary_loss_mlp": 0.01084917, + "balance_loss_clip": 1.02434921, + "balance_loss_mlp": 1.00456953, + "epoch": 0.5745205314735766, + "flos": 23840017050240.0, + "grad_norm": 1.8236833156231556, + "language_loss": 0.86344516, + "learning_rate": 1.6167933413590899e-06, + "loss": 0.88536626, + "num_input_tokens_seen": 103129890, + "step": 4778, + "time_per_iteration": 2.8031058311462402 + }, + { + "auxiliary_loss_clip": 0.01129905, + "auxiliary_loss_mlp": 0.01086113, + "balance_loss_clip": 1.02757883, + "balance_loss_mlp": 1.00571799, + "epoch": 0.5746407743642157, + "flos": 12311902373760.0, + "grad_norm": 2.080350395998161, + "language_loss": 0.90638399, + "learning_rate": 1.6160288312356773e-06, + "loss": 0.92854416, + "num_input_tokens_seen": 103147020, + "step": 4779, + "time_per_iteration": 2.610320806503296 + }, + { + "auxiliary_loss_clip": 0.01128939, + "auxiliary_loss_mlp": 0.01084834, + "balance_loss_clip": 1.02634108, + "balance_loss_mlp": 1.00439143, + "epoch": 0.5747610172548548, + "flos": 24133658734080.0, + "grad_norm": 1.5836923981230304, + "language_loss": 0.82092869, + "learning_rate": 1.6152643793603857e-06, + "loss": 0.84306639, + "num_input_tokens_seen": 103167370, + "step": 4780, + "time_per_iteration": 2.7216663360595703 + }, + { + "auxiliary_loss_clip": 0.01138063, + "auxiliary_loss_mlp": 0.01084493, + "balance_loss_clip": 1.02750099, + "balance_loss_mlp": 1.00409853, + "epoch": 0.5748812601454939, + "flos": 25408451393280.0, + "grad_norm": 1.662899404351643, + "language_loss": 0.87902594, + "learning_rate": 1.6144999858491815e-06, + "loss": 0.90125144, + "num_input_tokens_seen": 103186000, + "step": 4781, + "time_per_iteration": 2.6496095657348633 + }, + { + "auxiliary_loss_clip": 0.01118714, + "auxiliary_loss_mlp": 0.01085582, + "balance_loss_clip": 1.02562106, + "balance_loss_mlp": 1.00509179, + "epoch": 0.575001503036133, + "flos": 30624942827520.0, + "grad_norm": 1.630458081221427, + "language_loss": 0.85509169, + "learning_rate": 1.6137356508180232e-06, + "loss": 0.87713468, + "num_input_tokens_seen": 103207710, + "step": 4782, + "time_per_iteration": 2.826857566833496 + }, + { + "auxiliary_loss_clip": 0.01136371, + "auxiliary_loss_mlp": 0.00873166, + "balance_loss_clip": 1.02583444, + "balance_loss_mlp": 1.000139, + "epoch": 0.5751217459267721, + "flos": 21726566668800.0, + "grad_norm": 1.6184548533371632, + "language_loss": 0.81259936, + "learning_rate": 1.6129713743828593e-06, + "loss": 0.83269471, + "num_input_tokens_seen": 103226720, + "step": 4783, + "time_per_iteration": 2.735750675201416 + }, + { + "auxiliary_loss_clip": 0.01119067, + "auxiliary_loss_mlp": 0.01085006, + "balance_loss_clip": 1.02476454, + "balance_loss_mlp": 1.00465882, + "epoch": 0.5752419888174112, + "flos": 21651620941440.0, + "grad_norm": 1.3544163429693945, + "language_loss": 0.75576526, + "learning_rate": 1.6122071566596306e-06, + "loss": 0.77780604, + "num_input_tokens_seen": 103246995, + "step": 4784, + "time_per_iteration": 2.6896960735321045 + }, + { + "auxiliary_loss_clip": 0.01128653, + "auxiliary_loss_mlp": 0.01085367, + "balance_loss_clip": 1.02670515, + "balance_loss_mlp": 1.00487721, + "epoch": 0.5753622317080502, + "flos": 17775997234560.0, + "grad_norm": 4.223010492596148, + "language_loss": 0.83333397, + "learning_rate": 1.6114429977642674e-06, + "loss": 0.85547411, + "num_input_tokens_seen": 103261500, + "step": 4785, + "time_per_iteration": 2.68985915184021 + }, + { + "auxiliary_loss_clip": 0.01128955, + "auxiliary_loss_mlp": 0.01087201, + "balance_loss_clip": 1.02841139, + "balance_loss_mlp": 1.00666344, + "epoch": 0.5754824745986894, + "flos": 19789616741760.0, + "grad_norm": 1.7206363620111182, + "language_loss": 0.73599535, + "learning_rate": 1.6106788978126926e-06, + "loss": 0.7581569, + "num_input_tokens_seen": 103280475, + "step": 4786, + "time_per_iteration": 2.714580535888672 + }, + { + "auxiliary_loss_clip": 0.01099291, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_clip": 1.02307284, + "balance_loss_mlp": 1.00527215, + "epoch": 0.5756027174893285, + "flos": 30985665160320.0, + "grad_norm": 2.5626934434938007, + "language_loss": 0.78739965, + "learning_rate": 1.6099148569208196e-06, + "loss": 0.80924922, + "num_input_tokens_seen": 103297695, + "step": 4787, + "time_per_iteration": 2.951477527618408 + }, + { + "auxiliary_loss_clip": 0.01112167, + "auxiliary_loss_mlp": 0.01085903, + "balance_loss_clip": 1.02147198, + "balance_loss_mlp": 1.00546074, + "epoch": 0.5757229603799675, + "flos": 28546864364160.0, + "grad_norm": 1.6294925714042188, + "language_loss": 0.62901926, + "learning_rate": 1.6091508752045523e-06, + "loss": 0.6509999, + "num_input_tokens_seen": 103318575, + "step": 4788, + "time_per_iteration": 2.8147459030151367 + }, + { + "auxiliary_loss_clip": 0.01108749, + "auxiliary_loss_mlp": 0.01083855, + "balance_loss_clip": 1.02395928, + "balance_loss_mlp": 1.00345993, + "epoch": 0.5758432032706067, + "flos": 22999024944000.0, + "grad_norm": 1.5607047296170073, + "language_loss": 0.86513674, + "learning_rate": 1.608386952779787e-06, + "loss": 0.88706279, + "num_input_tokens_seen": 103337945, + "step": 4789, + "time_per_iteration": 2.78294038772583 + }, + { + "auxiliary_loss_clip": 0.01119362, + "auxiliary_loss_mlp": 0.01085493, + "balance_loss_clip": 1.02590775, + "balance_loss_mlp": 1.00514615, + "epoch": 0.5759634461612457, + "flos": 25739727552000.0, + "grad_norm": 1.5347358987302981, + "language_loss": 0.7457999, + "learning_rate": 1.6076230897624098e-06, + "loss": 0.76784843, + "num_input_tokens_seen": 103360150, + "step": 4790, + "time_per_iteration": 2.8625295162200928 + }, + { + "auxiliary_loss_clip": 0.01129065, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_clip": 1.02665615, + "balance_loss_mlp": 1.00482666, + "epoch": 0.5760836890518848, + "flos": 30591761639040.0, + "grad_norm": 2.6192653131424657, + "language_loss": 0.77589285, + "learning_rate": 1.6068592862682974e-06, + "loss": 0.79803711, + "num_input_tokens_seen": 103378305, + "step": 4791, + "time_per_iteration": 2.738913059234619 + }, + { + "auxiliary_loss_clip": 0.01118393, + "auxiliary_loss_mlp": 0.01085191, + "balance_loss_clip": 1.0253799, + "balance_loss_mlp": 1.00479674, + "epoch": 0.576203931942524, + "flos": 36538963447680.0, + "grad_norm": 1.8601146701480833, + "language_loss": 0.73332381, + "learning_rate": 1.6060955424133187e-06, + "loss": 0.75535971, + "num_input_tokens_seen": 103399230, + "step": 4792, + "time_per_iteration": 2.9029455184936523 + }, + { + "auxiliary_loss_clip": 0.01127216, + "auxiliary_loss_mlp": 0.01085499, + "balance_loss_clip": 1.02607465, + "balance_loss_mlp": 1.00505626, + "epoch": 0.576324174833163, + "flos": 25516937445120.0, + "grad_norm": 1.7225056742633302, + "language_loss": 0.89586747, + "learning_rate": 1.6053318583133332e-06, + "loss": 0.91799468, + "num_input_tokens_seen": 103420100, + "step": 4793, + "time_per_iteration": 3.6194839477539062 + }, + { + "auxiliary_loss_clip": 0.0112656, + "auxiliary_loss_mlp": 0.01084268, + "balance_loss_clip": 1.02528453, + "balance_loss_mlp": 1.00377834, + "epoch": 0.5764444177238021, + "flos": 25119262995840.0, + "grad_norm": 3.762053132331828, + "language_loss": 0.75266755, + "learning_rate": 1.6045682340841907e-06, + "loss": 0.77477574, + "num_input_tokens_seen": 103439025, + "step": 4794, + "time_per_iteration": 2.731254816055298 + }, + { + "auxiliary_loss_clip": 0.01094039, + "auxiliary_loss_mlp": 0.00873267, + "balance_loss_clip": 1.02234221, + "balance_loss_mlp": 1.00190532, + "epoch": 0.5765646606144411, + "flos": 62212687758720.0, + "grad_norm": 0.7469219855360273, + "language_loss": 0.58016664, + "learning_rate": 1.6038046698417336e-06, + "loss": 0.59983969, + "num_input_tokens_seen": 103499920, + "step": 4795, + "time_per_iteration": 4.3012495040893555 + }, + { + "auxiliary_loss_clip": 0.01129813, + "auxiliary_loss_mlp": 0.01085034, + "balance_loss_clip": 1.02733612, + "balance_loss_mlp": 1.00463891, + "epoch": 0.5766849035050803, + "flos": 25118760205440.0, + "grad_norm": 2.7512582818235045, + "language_loss": 0.69001347, + "learning_rate": 1.6030411657017919e-06, + "loss": 0.7121619, + "num_input_tokens_seen": 103519575, + "step": 4796, + "time_per_iteration": 2.6204023361206055 + }, + { + "auxiliary_loss_clip": 0.01130948, + "auxiliary_loss_mlp": 0.01084992, + "balance_loss_clip": 1.02800369, + "balance_loss_mlp": 1.00469267, + "epoch": 0.5768051463957193, + "flos": 15991093578240.0, + "grad_norm": 2.2664594165431717, + "language_loss": 0.84473813, + "learning_rate": 1.6022777217801903e-06, + "loss": 0.86689746, + "num_input_tokens_seen": 103536530, + "step": 4797, + "time_per_iteration": 3.5880470275878906 + }, + { + "auxiliary_loss_clip": 0.01104459, + "auxiliary_loss_mlp": 0.01084708, + "balance_loss_clip": 1.0263195, + "balance_loss_mlp": 1.00436139, + "epoch": 0.5769253892863584, + "flos": 22163635359360.0, + "grad_norm": 2.2399530097315585, + "language_loss": 0.73537493, + "learning_rate": 1.601514338192742e-06, + "loss": 0.75726664, + "num_input_tokens_seen": 103556460, + "step": 4798, + "time_per_iteration": 2.770975112915039 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01083406, + "balance_loss_clip": 1.02672482, + "balance_loss_mlp": 1.00315428, + "epoch": 0.5770456321769976, + "flos": 22856388036480.0, + "grad_norm": 2.5461573597639915, + "language_loss": 0.71830279, + "learning_rate": 1.6007510150552514e-06, + "loss": 0.74050462, + "num_input_tokens_seen": 103574520, + "step": 4799, + "time_per_iteration": 2.577629566192627 + }, + { + "auxiliary_loss_clip": 0.01129793, + "auxiliary_loss_mlp": 0.01084998, + "balance_loss_clip": 1.02669024, + "balance_loss_mlp": 1.00441265, + "epoch": 0.5771658750676366, + "flos": 46353672489600.0, + "grad_norm": 1.4636116950940967, + "language_loss": 0.62079346, + "learning_rate": 1.599987752483515e-06, + "loss": 0.6429413, + "num_input_tokens_seen": 103598965, + "step": 4800, + "time_per_iteration": 3.765812635421753 + }, + { + "auxiliary_loss_clip": 0.01112768, + "auxiliary_loss_mlp": 0.0108386, + "balance_loss_clip": 1.02672756, + "balance_loss_mlp": 1.00346494, + "epoch": 0.5772861179582757, + "flos": 22159972172160.0, + "grad_norm": 1.5509102122126759, + "language_loss": 0.67831326, + "learning_rate": 1.5992245505933184e-06, + "loss": 0.70027953, + "num_input_tokens_seen": 103618665, + "step": 4801, + "time_per_iteration": 2.7335429191589355 + }, + { + "auxiliary_loss_clip": 0.01139271, + "auxiliary_loss_mlp": 0.0108404, + "balance_loss_clip": 1.02838922, + "balance_loss_mlp": 1.0036931, + "epoch": 0.5774063608489148, + "flos": 31248926916480.0, + "grad_norm": 1.8696247867496687, + "language_loss": 0.71432549, + "learning_rate": 1.5984614095004388e-06, + "loss": 0.73655862, + "num_input_tokens_seen": 103639800, + "step": 4802, + "time_per_iteration": 2.7297232151031494 + }, + { + "auxiliary_loss_clip": 0.01128774, + "auxiliary_loss_mlp": 0.0108452, + "balance_loss_clip": 1.02583265, + "balance_loss_mlp": 1.00412536, + "epoch": 0.5775266037395539, + "flos": 22527123039360.0, + "grad_norm": 1.9491290502605016, + "language_loss": 0.81394941, + "learning_rate": 1.5976983293206438e-06, + "loss": 0.83608234, + "num_input_tokens_seen": 103655605, + "step": 4803, + "time_per_iteration": 2.726175546646118 + }, + { + "auxiliary_loss_clip": 0.01120809, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_clip": 1.02586031, + "balance_loss_mlp": 1.00469232, + "epoch": 0.577646846630193, + "flos": 21068790860160.0, + "grad_norm": 1.771543125767011, + "language_loss": 0.71081889, + "learning_rate": 1.5969353101696928e-06, + "loss": 0.73287737, + "num_input_tokens_seen": 103674045, + "step": 4804, + "time_per_iteration": 2.7540979385375977 + }, + { + "auxiliary_loss_clip": 0.01127605, + "auxiliary_loss_mlp": 0.0108453, + "balance_loss_clip": 1.02570629, + "balance_loss_mlp": 1.00423026, + "epoch": 0.5777670895208321, + "flos": 29714284293120.0, + "grad_norm": 1.6043910198998776, + "language_loss": 0.79770446, + "learning_rate": 1.5961723521633341e-06, + "loss": 0.81982577, + "num_input_tokens_seen": 103695285, + "step": 4805, + "time_per_iteration": 2.8849668502807617 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01085557, + "balance_loss_clip": 1.02651227, + "balance_loss_mlp": 1.00516248, + "epoch": 0.5778873324114712, + "flos": 19500428344320.0, + "grad_norm": 2.28475698655222, + "language_loss": 0.90747082, + "learning_rate": 1.5954094554173097e-06, + "loss": 0.92937088, + "num_input_tokens_seen": 103713275, + "step": 4806, + "time_per_iteration": 2.6774020195007324 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_clip": 1.02700877, + "balance_loss_mlp": 1.00420403, + "epoch": 0.5780075753021102, + "flos": 14136846716160.0, + "grad_norm": 1.9157751397743716, + "language_loss": 0.79154772, + "learning_rate": 1.5946466200473482e-06, + "loss": 0.81360292, + "num_input_tokens_seen": 103731185, + "step": 4807, + "time_per_iteration": 2.814054250717163 + }, + { + "auxiliary_loss_clip": 0.01122396, + "auxiliary_loss_mlp": 0.0108421, + "balance_loss_clip": 1.02812123, + "balance_loss_mlp": 1.00381494, + "epoch": 0.5781278181927494, + "flos": 15262178883840.0, + "grad_norm": 1.6877636897047825, + "language_loss": 0.83251327, + "learning_rate": 1.5938838461691723e-06, + "loss": 0.85457933, + "num_input_tokens_seen": 103748095, + "step": 4808, + "time_per_iteration": 2.6349143981933594 + }, + { + "auxiliary_loss_clip": 0.01139097, + "auxiliary_loss_mlp": 0.01084271, + "balance_loss_clip": 1.02866578, + "balance_loss_mlp": 1.00378108, + "epoch": 0.5782480610833884, + "flos": 16726831856640.0, + "grad_norm": 2.1542724684625845, + "language_loss": 0.82908982, + "learning_rate": 1.593121133898494e-06, + "loss": 0.85132349, + "num_input_tokens_seen": 103765300, + "step": 4809, + "time_per_iteration": 2.8009142875671387 + }, + { + "auxiliary_loss_clip": 0.0112909, + "auxiliary_loss_mlp": 0.01085305, + "balance_loss_clip": 1.02630067, + "balance_loss_mlp": 1.00486255, + "epoch": 0.5783683039740275, + "flos": 25482140144640.0, + "grad_norm": 1.8552843894476672, + "language_loss": 0.79088974, + "learning_rate": 1.592358483351016e-06, + "loss": 0.8130337, + "num_input_tokens_seen": 103785475, + "step": 4810, + "time_per_iteration": 2.6770694255828857 + }, + { + "auxiliary_loss_clip": 0.01126338, + "auxiliary_loss_mlp": 0.01083146, + "balance_loss_clip": 1.02565205, + "balance_loss_mlp": 1.00284624, + "epoch": 0.5784885468646667, + "flos": 18405835240320.0, + "grad_norm": 1.789827532378764, + "language_loss": 0.721506, + "learning_rate": 1.5915958946424326e-06, + "loss": 0.74360085, + "num_input_tokens_seen": 103804160, + "step": 4811, + "time_per_iteration": 2.7540102005004883 + }, + { + "auxiliary_loss_clip": 0.01105222, + "auxiliary_loss_mlp": 0.00873054, + "balance_loss_clip": 1.02152157, + "balance_loss_mlp": 1.0000807, + "epoch": 0.5786087897553057, + "flos": 46100717936640.0, + "grad_norm": 1.547895835538099, + "language_loss": 0.74308187, + "learning_rate": 1.5908333678884271e-06, + "loss": 0.76286465, + "num_input_tokens_seen": 103830580, + "step": 4812, + "time_per_iteration": 3.1514930725097656 + }, + { + "auxiliary_loss_clip": 0.01127511, + "auxiliary_loss_mlp": 0.01084489, + "balance_loss_clip": 1.02654874, + "balance_loss_mlp": 1.0040462, + "epoch": 0.5787290326459448, + "flos": 12385950261120.0, + "grad_norm": 1.8165888429479795, + "language_loss": 0.73893243, + "learning_rate": 1.5900709032046743e-06, + "loss": 0.76105243, + "num_input_tokens_seen": 103848655, + "step": 4813, + "time_per_iteration": 2.645296335220337 + }, + { + "auxiliary_loss_clip": 0.0111769, + "auxiliary_loss_mlp": 0.01085385, + "balance_loss_clip": 1.02561641, + "balance_loss_mlp": 1.00513315, + "epoch": 0.5788492755365839, + "flos": 23290332243840.0, + "grad_norm": 1.9374002074167715, + "language_loss": 0.77996814, + "learning_rate": 1.5893085007068391e-06, + "loss": 0.80199885, + "num_input_tokens_seen": 103866215, + "step": 4814, + "time_per_iteration": 2.732609748840332 + }, + { + "auxiliary_loss_clip": 0.01121023, + "auxiliary_loss_mlp": 0.01084739, + "balance_loss_clip": 1.0267086, + "balance_loss_mlp": 1.00443983, + "epoch": 0.578969518427223, + "flos": 24061047390720.0, + "grad_norm": 1.8726701678935336, + "language_loss": 0.70913625, + "learning_rate": 1.5885461605105786e-06, + "loss": 0.73119384, + "num_input_tokens_seen": 103887815, + "step": 4815, + "time_per_iteration": 2.740574359893799 + }, + { + "auxiliary_loss_clip": 0.01114072, + "auxiliary_loss_mlp": 0.01085814, + "balance_loss_clip": 1.0221324, + "balance_loss_mlp": 1.00541902, + "epoch": 0.579089761317862, + "flos": 21871825269120.0, + "grad_norm": 1.7797518969705333, + "language_loss": 0.7632153, + "learning_rate": 1.5877838827315375e-06, + "loss": 0.78521413, + "num_input_tokens_seen": 103906360, + "step": 4816, + "time_per_iteration": 2.774304151535034 + }, + { + "auxiliary_loss_clip": 0.01138011, + "auxiliary_loss_mlp": 0.01085615, + "balance_loss_clip": 1.02764654, + "balance_loss_mlp": 1.00502944, + "epoch": 0.5792100042085012, + "flos": 22929681738240.0, + "grad_norm": 1.8633702070915876, + "language_loss": 0.7050578, + "learning_rate": 1.587021667485355e-06, + "loss": 0.72729409, + "num_input_tokens_seen": 103925730, + "step": 4817, + "time_per_iteration": 2.7069332599639893 + }, + { + "auxiliary_loss_clip": 0.01119864, + "auxiliary_loss_mlp": 0.01083722, + "balance_loss_clip": 1.02545476, + "balance_loss_mlp": 1.00342226, + "epoch": 0.5793302470991403, + "flos": 21470056669440.0, + "grad_norm": 2.3359316798052947, + "language_loss": 0.7841779, + "learning_rate": 1.5862595148876559e-06, + "loss": 0.80621374, + "num_input_tokens_seen": 103945835, + "step": 4818, + "time_per_iteration": 3.6686322689056396 + }, + { + "auxiliary_loss_clip": 0.01096372, + "auxiliary_loss_mlp": 0.01084452, + "balance_loss_clip": 1.02201498, + "balance_loss_mlp": 1.00400949, + "epoch": 0.5794504899897793, + "flos": 12711013367040.0, + "grad_norm": 2.363414743569235, + "language_loss": 0.76650661, + "learning_rate": 1.58549742505406e-06, + "loss": 0.78831488, + "num_input_tokens_seen": 103960580, + "step": 4819, + "time_per_iteration": 2.8156731128692627 + }, + { + "auxiliary_loss_clip": 0.01136478, + "auxiliary_loss_mlp": 0.01084125, + "balance_loss_clip": 1.02625871, + "balance_loss_mlp": 1.00358677, + "epoch": 0.5795707328804185, + "flos": 14867054300160.0, + "grad_norm": 2.21926036761811, + "language_loss": 0.75414735, + "learning_rate": 1.5847353981001747e-06, + "loss": 0.77635336, + "num_input_tokens_seen": 103977760, + "step": 4820, + "time_per_iteration": 3.5965826511383057 + }, + { + "auxiliary_loss_clip": 0.01119851, + "auxiliary_loss_mlp": 0.01085615, + "balance_loss_clip": 1.02560782, + "balance_loss_mlp": 1.0053159, + "epoch": 0.5796909757710575, + "flos": 36430046432640.0, + "grad_norm": 1.6788422433641004, + "language_loss": 0.69826412, + "learning_rate": 1.5839734341415993e-06, + "loss": 0.72031873, + "num_input_tokens_seen": 103999960, + "step": 4821, + "time_per_iteration": 2.8788902759552 + }, + { + "auxiliary_loss_clip": 0.01121845, + "auxiliary_loss_mlp": 0.01083559, + "balance_loss_clip": 1.02709711, + "balance_loss_mlp": 1.00335455, + "epoch": 0.5798112186616966, + "flos": 23039891642880.0, + "grad_norm": 1.7065227107951382, + "language_loss": 0.7668311, + "learning_rate": 1.5832115332939238e-06, + "loss": 0.78888512, + "num_input_tokens_seen": 104018400, + "step": 4822, + "time_per_iteration": 3.7532055377960205 + }, + { + "auxiliary_loss_clip": 0.01129031, + "auxiliary_loss_mlp": 0.01085305, + "balance_loss_clip": 1.02673745, + "balance_loss_mlp": 1.0049578, + "epoch": 0.5799314615523358, + "flos": 16652604401280.0, + "grad_norm": 1.7433400668950496, + "language_loss": 0.74566513, + "learning_rate": 1.5824496956727272e-06, + "loss": 0.76780856, + "num_input_tokens_seen": 104035605, + "step": 4823, + "time_per_iteration": 2.713824510574341 + }, + { + "auxiliary_loss_clip": 0.01116407, + "auxiliary_loss_mlp": 0.01084236, + "balance_loss_clip": 1.02355051, + "balance_loss_mlp": 1.00384116, + "epoch": 0.5800517044429748, + "flos": 20485673470080.0, + "grad_norm": 1.6794814662148747, + "language_loss": 0.72964263, + "learning_rate": 1.5816879213935797e-06, + "loss": 0.75164908, + "num_input_tokens_seen": 104054415, + "step": 4824, + "time_per_iteration": 2.847937822341919 + }, + { + "auxiliary_loss_clip": 0.01128233, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_clip": 1.02718484, + "balance_loss_mlp": 1.00459731, + "epoch": 0.5801719473336139, + "flos": 31538258968320.0, + "grad_norm": 1.6339994930110668, + "language_loss": 0.79512966, + "learning_rate": 1.5809262105720416e-06, + "loss": 0.81726092, + "num_input_tokens_seen": 104075455, + "step": 4825, + "time_per_iteration": 2.74165415763855 + }, + { + "auxiliary_loss_clip": 0.01136065, + "auxiliary_loss_mlp": 0.01083646, + "balance_loss_clip": 1.02608407, + "balance_loss_mlp": 1.00339484, + "epoch": 0.580292190224253, + "flos": 20375966355840.0, + "grad_norm": 1.508938953005672, + "language_loss": 0.79620546, + "learning_rate": 1.5801645633236644e-06, + "loss": 0.81840253, + "num_input_tokens_seen": 104096440, + "step": 4826, + "time_per_iteration": 3.5767271518707275 + }, + { + "auxiliary_loss_clip": 0.01118235, + "auxiliary_loss_mlp": 0.0108644, + "balance_loss_clip": 1.02488887, + "balance_loss_mlp": 1.0060935, + "epoch": 0.5804124331148921, + "flos": 26615373304320.0, + "grad_norm": 1.853056745237817, + "language_loss": 0.77558923, + "learning_rate": 1.579402979763989e-06, + "loss": 0.79763603, + "num_input_tokens_seen": 104116775, + "step": 4827, + "time_per_iteration": 2.7740981578826904 + }, + { + "auxiliary_loss_clip": 0.01077944, + "auxiliary_loss_mlp": 0.01084311, + "balance_loss_clip": 1.02497554, + "balance_loss_mlp": 1.00405908, + "epoch": 0.5805326760055312, + "flos": 13478496289920.0, + "grad_norm": 2.098509836734173, + "language_loss": 0.81412756, + "learning_rate": 1.578641460008548e-06, + "loss": 0.8357501, + "num_input_tokens_seen": 104134510, + "step": 4828, + "time_per_iteration": 2.7751433849334717 + }, + { + "auxiliary_loss_clip": 0.01126108, + "auxiliary_loss_mlp": 0.01084614, + "balance_loss_clip": 1.0247395, + "balance_loss_mlp": 1.00431418, + "epoch": 0.5806529188961702, + "flos": 12091374823680.0, + "grad_norm": 1.9664748320176744, + "language_loss": 0.67934418, + "learning_rate": 1.5778800041728613e-06, + "loss": 0.70145142, + "num_input_tokens_seen": 104150800, + "step": 4829, + "time_per_iteration": 2.6760759353637695 + }, + { + "auxiliary_loss_clip": 0.01125558, + "auxiliary_loss_mlp": 0.01084739, + "balance_loss_clip": 1.025244, + "balance_loss_mlp": 1.00439203, + "epoch": 0.5807731617868094, + "flos": 26214107495040.0, + "grad_norm": 1.5829657140752422, + "language_loss": 0.66280586, + "learning_rate": 1.577118612372443e-06, + "loss": 0.68490887, + "num_input_tokens_seen": 104172640, + "step": 4830, + "time_per_iteration": 2.695314407348633 + }, + { + "auxiliary_loss_clip": 0.01118062, + "auxiliary_loss_mlp": 0.00873055, + "balance_loss_clip": 1.024773, + "balance_loss_mlp": 1.00009823, + "epoch": 0.5808934046774484, + "flos": 37962139190400.0, + "grad_norm": 2.3246634824941608, + "language_loss": 0.70502877, + "learning_rate": 1.5763572847227943e-06, + "loss": 0.72493994, + "num_input_tokens_seen": 104193525, + "step": 4831, + "time_per_iteration": 2.8457937240600586 + }, + { + "auxiliary_loss_clip": 0.01127506, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_clip": 1.02588725, + "balance_loss_mlp": 1.00465584, + "epoch": 0.5810136475680875, + "flos": 20485853038080.0, + "grad_norm": 2.5906774170575346, + "language_loss": 0.81379378, + "learning_rate": 1.5755960213394091e-06, + "loss": 0.83591747, + "num_input_tokens_seen": 104210625, + "step": 4832, + "time_per_iteration": 2.627579689025879 + }, + { + "auxiliary_loss_clip": 0.0109808, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_clip": 1.02841473, + "balance_loss_mlp": 1.00565982, + "epoch": 0.5811338904587267, + "flos": 17530153574400.0, + "grad_norm": 1.8888004086204717, + "language_loss": 0.78367311, + "learning_rate": 1.5748348223377703e-06, + "loss": 0.8055135, + "num_input_tokens_seen": 104228180, + "step": 4833, + "time_per_iteration": 2.88063645362854 + }, + { + "auxiliary_loss_clip": 0.01119148, + "auxiliary_loss_mlp": 0.01084448, + "balance_loss_clip": 1.02613139, + "balance_loss_mlp": 1.00419641, + "epoch": 0.5812541333493657, + "flos": 19458017360640.0, + "grad_norm": 1.6433741275524825, + "language_loss": 0.78115988, + "learning_rate": 1.5740736878333507e-06, + "loss": 0.80319583, + "num_input_tokens_seen": 104246020, + "step": 4834, + "time_per_iteration": 2.6776320934295654 + }, + { + "auxiliary_loss_clip": 0.01119074, + "auxiliary_loss_mlp": 0.01085312, + "balance_loss_clip": 1.02502871, + "balance_loss_mlp": 1.00501227, + "epoch": 0.5813743762400048, + "flos": 20594949621120.0, + "grad_norm": 2.130830521205718, + "language_loss": 0.77606505, + "learning_rate": 1.5733126179416143e-06, + "loss": 0.79810894, + "num_input_tokens_seen": 104260505, + "step": 4835, + "time_per_iteration": 2.7391438484191895 + }, + { + "auxiliary_loss_clip": 0.01127179, + "auxiliary_loss_mlp": 0.010851, + "balance_loss_clip": 1.0258379, + "balance_loss_mlp": 1.004848, + "epoch": 0.5814946191306439, + "flos": 33178227246720.0, + "grad_norm": 2.0588823214611818, + "language_loss": 0.72284985, + "learning_rate": 1.5725516127780137e-06, + "loss": 0.74497271, + "num_input_tokens_seen": 104282640, + "step": 4836, + "time_per_iteration": 2.7891933917999268 + }, + { + "auxiliary_loss_clip": 0.01129151, + "auxiliary_loss_mlp": 0.01084859, + "balance_loss_clip": 1.02598929, + "balance_loss_mlp": 1.00441647, + "epoch": 0.581614862021283, + "flos": 16143283503360.0, + "grad_norm": 2.0682430467085577, + "language_loss": 0.88270307, + "learning_rate": 1.5717906724579943e-06, + "loss": 0.90484309, + "num_input_tokens_seen": 104299700, + "step": 4837, + "time_per_iteration": 2.73390531539917 + }, + { + "auxiliary_loss_clip": 0.01114602, + "auxiliary_loss_mlp": 0.01086351, + "balance_loss_clip": 1.02783167, + "balance_loss_mlp": 1.00600398, + "epoch": 0.581735104911922, + "flos": 33802642298880.0, + "grad_norm": 2.2095750672386503, + "language_loss": 0.67982352, + "learning_rate": 1.571029797096989e-06, + "loss": 0.70183301, + "num_input_tokens_seen": 104320805, + "step": 4838, + "time_per_iteration": 2.976712942123413 + }, + { + "auxiliary_loss_clip": 0.01136687, + "auxiliary_loss_mlp": 0.01084977, + "balance_loss_clip": 1.02654338, + "balance_loss_mlp": 1.00472534, + "epoch": 0.5818553478025612, + "flos": 23331163029120.0, + "grad_norm": 1.6182627636170073, + "language_loss": 0.78856665, + "learning_rate": 1.570268986810423e-06, + "loss": 0.81078327, + "num_input_tokens_seen": 104340700, + "step": 4839, + "time_per_iteration": 2.6680543422698975 + }, + { + "auxiliary_loss_clip": 0.01120511, + "auxiliary_loss_mlp": 0.01084705, + "balance_loss_clip": 1.02701414, + "balance_loss_mlp": 1.00445282, + "epoch": 0.5819755906932003, + "flos": 20996143603200.0, + "grad_norm": 1.852192048817806, + "language_loss": 0.74437666, + "learning_rate": 1.5695082417137096e-06, + "loss": 0.76642883, + "num_input_tokens_seen": 104358575, + "step": 4840, + "time_per_iteration": 2.8028366565704346 + }, + { + "auxiliary_loss_clip": 0.01118149, + "auxiliary_loss_mlp": 0.01084948, + "balance_loss_clip": 1.02440095, + "balance_loss_mlp": 1.0046488, + "epoch": 0.5820958335838393, + "flos": 21431668008960.0, + "grad_norm": 1.5269601916083693, + "language_loss": 0.75334215, + "learning_rate": 1.5687475619222539e-06, + "loss": 0.7753731, + "num_input_tokens_seen": 104378530, + "step": 4841, + "time_per_iteration": 2.797607898712158 + }, + { + "auxiliary_loss_clip": 0.01121564, + "auxiliary_loss_mlp": 0.01084317, + "balance_loss_clip": 1.02749205, + "balance_loss_mlp": 1.00387406, + "epoch": 0.5822160764744785, + "flos": 17967473660160.0, + "grad_norm": 2.0126194888193325, + "language_loss": 0.73167777, + "learning_rate": 1.5679869475514496e-06, + "loss": 0.75373662, + "num_input_tokens_seen": 104395465, + "step": 4842, + "time_per_iteration": 2.7309088706970215 + }, + { + "auxiliary_loss_clip": 0.01127191, + "auxiliary_loss_mlp": 0.01085106, + "balance_loss_clip": 1.02671075, + "balance_loss_mlp": 1.00466347, + "epoch": 0.5823363193651175, + "flos": 23033858158080.0, + "grad_norm": 2.2052644229164806, + "language_loss": 0.81318343, + "learning_rate": 1.567226398716682e-06, + "loss": 0.83530641, + "num_input_tokens_seen": 104415380, + "step": 4843, + "time_per_iteration": 2.6543807983398438 + }, + { + "auxiliary_loss_clip": 0.01119649, + "auxiliary_loss_mlp": 0.01084978, + "balance_loss_clip": 1.0256269, + "balance_loss_mlp": 1.00453532, + "epoch": 0.5824565622557566, + "flos": 32891840110080.0, + "grad_norm": 1.7307300801847043, + "language_loss": 0.61834019, + "learning_rate": 1.566465915533326e-06, + "loss": 0.64038646, + "num_input_tokens_seen": 104437410, + "step": 4844, + "time_per_iteration": 3.752833843231201 + }, + { + "auxiliary_loss_clip": 0.01127611, + "auxiliary_loss_mlp": 0.01083566, + "balance_loss_clip": 1.02657628, + "balance_loss_mlp": 1.00336194, + "epoch": 0.5825768051463958, + "flos": 22229674513920.0, + "grad_norm": 1.9413773272659414, + "language_loss": 0.8816942, + "learning_rate": 1.5657054981167458e-06, + "loss": 0.90380597, + "num_input_tokens_seen": 104456305, + "step": 4845, + "time_per_iteration": 3.5667073726654053 + }, + { + "auxiliary_loss_clip": 0.01130214, + "auxiliary_loss_mlp": 0.01084404, + "balance_loss_clip": 1.02794957, + "balance_loss_mlp": 1.00419962, + "epoch": 0.5826970480370348, + "flos": 28001561016960.0, + "grad_norm": 2.147593730363723, + "language_loss": 0.67900306, + "learning_rate": 1.5649451465822965e-06, + "loss": 0.70114923, + "num_input_tokens_seen": 104477695, + "step": 4846, + "time_per_iteration": 2.7061219215393066 + }, + { + "auxiliary_loss_clip": 0.01098418, + "auxiliary_loss_mlp": 0.01084479, + "balance_loss_clip": 1.02400732, + "balance_loss_mlp": 1.00422764, + "epoch": 0.5828172909276739, + "flos": 17858053854720.0, + "grad_norm": 1.6337711396331278, + "language_loss": 0.83918232, + "learning_rate": 1.5641848610453218e-06, + "loss": 0.86101133, + "num_input_tokens_seen": 104496355, + "step": 4847, + "time_per_iteration": 2.8050410747528076 + }, + { + "auxiliary_loss_clip": 0.01121879, + "auxiliary_loss_mlp": 0.01084215, + "balance_loss_clip": 1.02652144, + "balance_loss_mlp": 1.00386786, + "epoch": 0.582937533818313, + "flos": 19865244827520.0, + "grad_norm": 2.1425482350333187, + "language_loss": 0.85887378, + "learning_rate": 1.563424641621158e-06, + "loss": 0.88093472, + "num_input_tokens_seen": 104515535, + "step": 4848, + "time_per_iteration": 3.611069917678833 + }, + { + "auxiliary_loss_clip": 0.01119198, + "auxiliary_loss_mlp": 0.01086264, + "balance_loss_clip": 1.02608442, + "balance_loss_mlp": 1.00586879, + "epoch": 0.5830577767089521, + "flos": 26870734068480.0, + "grad_norm": 1.8172068655365878, + "language_loss": 0.69768733, + "learning_rate": 1.5626644884251282e-06, + "loss": 0.71974194, + "num_input_tokens_seen": 104535055, + "step": 4849, + "time_per_iteration": 2.84494948387146 + }, + { + "auxiliary_loss_clip": 0.01136209, + "auxiliary_loss_mlp": 0.01083867, + "balance_loss_clip": 1.0259223, + "balance_loss_mlp": 1.00366306, + "epoch": 0.5831780195995911, + "flos": 25298205575040.0, + "grad_norm": 1.7087576065039203, + "language_loss": 0.87873769, + "learning_rate": 1.5619044015725488e-06, + "loss": 0.90093845, + "num_input_tokens_seen": 104554745, + "step": 4850, + "time_per_iteration": 2.7444164752960205 + }, + { + "auxiliary_loss_clip": 0.01141034, + "auxiliary_loss_mlp": 0.01085655, + "balance_loss_clip": 1.03013635, + "balance_loss_mlp": 1.00521302, + "epoch": 0.5832982624902303, + "flos": 14756988049920.0, + "grad_norm": 2.1850490083700547, + "language_loss": 0.87312806, + "learning_rate": 1.5611443811787224e-06, + "loss": 0.89539498, + "num_input_tokens_seen": 104568870, + "step": 4851, + "time_per_iteration": 3.516103506088257 + }, + { + "auxiliary_loss_clip": 0.01127885, + "auxiliary_loss_mlp": 0.01084784, + "balance_loss_clip": 1.02681804, + "balance_loss_mlp": 1.0045799, + "epoch": 0.5834185053808694, + "flos": 20444555376000.0, + "grad_norm": 2.2732559307063407, + "language_loss": 0.69060576, + "learning_rate": 1.560384427358945e-06, + "loss": 0.71273249, + "num_input_tokens_seen": 104588415, + "step": 4852, + "time_per_iteration": 2.6703696250915527 + }, + { + "auxiliary_loss_clip": 0.01120313, + "auxiliary_loss_mlp": 0.01084968, + "balance_loss_clip": 1.025756, + "balance_loss_mlp": 1.00462115, + "epoch": 0.5835387482715084, + "flos": 27200394115200.0, + "grad_norm": 1.6146025310316527, + "language_loss": 0.72890818, + "learning_rate": 1.5596245402284998e-06, + "loss": 0.75096101, + "num_input_tokens_seen": 104611940, + "step": 4853, + "time_per_iteration": 2.8174619674682617 + }, + { + "auxiliary_loss_clip": 0.01129162, + "auxiliary_loss_mlp": 0.01085953, + "balance_loss_clip": 1.02750063, + "balance_loss_mlp": 1.00565386, + "epoch": 0.5836589911621476, + "flos": 16654615562880.0, + "grad_norm": 2.9526823314188437, + "language_loss": 0.81984735, + "learning_rate": 1.5588647199026619e-06, + "loss": 0.84199846, + "num_input_tokens_seen": 104629675, + "step": 4854, + "time_per_iteration": 2.6542553901672363 + }, + { + "auxiliary_loss_clip": 0.01139332, + "auxiliary_loss_mlp": 0.01086866, + "balance_loss_clip": 1.02901947, + "balance_loss_mlp": 1.006567, + "epoch": 0.5837792340527866, + "flos": 20446817932800.0, + "grad_norm": 2.5211260194276046, + "language_loss": 0.87478852, + "learning_rate": 1.5581049664966956e-06, + "loss": 0.8970505, + "num_input_tokens_seen": 104647435, + "step": 4855, + "time_per_iteration": 2.680788040161133 + }, + { + "auxiliary_loss_clip": 0.01074241, + "auxiliary_loss_mlp": 0.01079266, + "balance_loss_clip": 1.01918888, + "balance_loss_mlp": 1.0003022, + "epoch": 0.5838994769434257, + "flos": 65995480765440.0, + "grad_norm": 0.999921540803464, + "language_loss": 0.65163988, + "learning_rate": 1.5573452801258545e-06, + "loss": 0.67317498, + "num_input_tokens_seen": 104694605, + "step": 4856, + "time_per_iteration": 3.1787989139556885 + }, + { + "auxiliary_loss_clip": 0.01129842, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_clip": 1.02692056, + "balance_loss_mlp": 1.00422943, + "epoch": 0.5840197198340649, + "flos": 21470523546240.0, + "grad_norm": 2.385366127233611, + "language_loss": 0.63459718, + "learning_rate": 1.5565856609053824e-06, + "loss": 0.65674138, + "num_input_tokens_seen": 104713400, + "step": 4857, + "time_per_iteration": 2.700918197631836 + }, + { + "auxiliary_loss_clip": 0.01137769, + "auxiliary_loss_mlp": 0.01083661, + "balance_loss_clip": 1.02770138, + "balance_loss_mlp": 1.00336194, + "epoch": 0.5841399627247039, + "flos": 19135144984320.0, + "grad_norm": 1.7969854090433284, + "language_loss": 0.79781795, + "learning_rate": 1.5558261089505127e-06, + "loss": 0.82003224, + "num_input_tokens_seen": 104732130, + "step": 4858, + "time_per_iteration": 2.5824966430664062 + }, + { + "auxiliary_loss_clip": 0.01128323, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_clip": 1.02760994, + "balance_loss_mlp": 1.00521207, + "epoch": 0.584260205615343, + "flos": 26425692558720.0, + "grad_norm": 2.5806127858305987, + "language_loss": 0.79748094, + "learning_rate": 1.5550666243764697e-06, + "loss": 0.81961977, + "num_input_tokens_seen": 104750290, + "step": 4859, + "time_per_iteration": 2.7495288848876953 + }, + { + "auxiliary_loss_clip": 0.01125561, + "auxiliary_loss_mlp": 0.01085355, + "balance_loss_clip": 1.02522278, + "balance_loss_mlp": 1.00500822, + "epoch": 0.584380448505982, + "flos": 13881809174400.0, + "grad_norm": 2.103689888849914, + "language_loss": 0.77503735, + "learning_rate": 1.554307207298465e-06, + "loss": 0.79714656, + "num_input_tokens_seen": 104768550, + "step": 4860, + "time_per_iteration": 2.579665422439575 + }, + { + "auxiliary_loss_clip": 0.0113792, + "auxiliary_loss_mlp": 0.01084818, + "balance_loss_clip": 1.027179, + "balance_loss_mlp": 1.00451827, + "epoch": 0.5845006913966212, + "flos": 21543709507200.0, + "grad_norm": 1.84959188946736, + "language_loss": 0.79027653, + "learning_rate": 1.553547857831704e-06, + "loss": 0.81250393, + "num_input_tokens_seen": 104785060, + "step": 4861, + "time_per_iteration": 2.6672234535217285 + }, + { + "auxiliary_loss_clip": 0.01117388, + "auxiliary_loss_mlp": 0.01079132, + "balance_loss_clip": 1.02125716, + "balance_loss_mlp": 1.00016785, + "epoch": 0.5846209342872603, + "flos": 58375452712320.0, + "grad_norm": 0.8809808799461563, + "language_loss": 0.64189363, + "learning_rate": 1.5527885760913771e-06, + "loss": 0.66385889, + "num_input_tokens_seen": 104834950, + "step": 4862, + "time_per_iteration": 3.053853750228882 + }, + { + "auxiliary_loss_clip": 0.0111918, + "auxiliary_loss_mlp": 0.01085802, + "balance_loss_clip": 1.02660537, + "balance_loss_mlp": 1.00559831, + "epoch": 0.5847411771778993, + "flos": 18588045957120.0, + "grad_norm": 1.9266318778863805, + "language_loss": 0.76475692, + "learning_rate": 1.552029362192668e-06, + "loss": 0.7868067, + "num_input_tokens_seen": 104854210, + "step": 4863, + "time_per_iteration": 2.7058889865875244 + }, + { + "auxiliary_loss_clip": 0.01109174, + "auxiliary_loss_mlp": 0.01083895, + "balance_loss_clip": 1.02497876, + "balance_loss_mlp": 1.00354743, + "epoch": 0.5848614200685385, + "flos": 24240780069120.0, + "grad_norm": 1.7235481004777295, + "language_loss": 0.7229926, + "learning_rate": 1.5512702162507478e-06, + "loss": 0.74492329, + "num_input_tokens_seen": 104874525, + "step": 4864, + "time_per_iteration": 2.8229033946990967 + }, + { + "auxiliary_loss_clip": 0.01099839, + "auxiliary_loss_mlp": 0.0107936, + "balance_loss_clip": 1.02027059, + "balance_loss_mlp": 1.00039566, + "epoch": 0.5849816629591775, + "flos": 71660245933440.0, + "grad_norm": 1.1400874588805001, + "language_loss": 0.55769205, + "learning_rate": 1.5505111383807792e-06, + "loss": 0.57948405, + "num_input_tokens_seen": 104937195, + "step": 4865, + "time_per_iteration": 3.304699182510376 + }, + { + "auxiliary_loss_clip": 0.01099545, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_clip": 1.0235343, + "balance_loss_mlp": 1.00487149, + "epoch": 0.5851019058498166, + "flos": 23802095266560.0, + "grad_norm": 2.0756679674983682, + "language_loss": 0.80619359, + "learning_rate": 1.5497521286979138e-06, + "loss": 0.8280412, + "num_input_tokens_seen": 104957435, + "step": 4866, + "time_per_iteration": 2.848634719848633 + }, + { + "auxiliary_loss_clip": 0.01102026, + "auxiliary_loss_mlp": 0.01085114, + "balance_loss_clip": 1.02256656, + "balance_loss_mlp": 1.00462401, + "epoch": 0.5852221487404557, + "flos": 24388516707840.0, + "grad_norm": 1.8497441017905112, + "language_loss": 0.74299634, + "learning_rate": 1.5489931873172927e-06, + "loss": 0.76486778, + "num_input_tokens_seen": 104978755, + "step": 4867, + "time_per_iteration": 2.7971866130828857 + }, + { + "auxiliary_loss_clip": 0.01086014, + "auxiliary_loss_mlp": 0.01084802, + "balance_loss_clip": 1.02383018, + "balance_loss_mlp": 1.00445521, + "epoch": 0.5853423916310948, + "flos": 27271425260160.0, + "grad_norm": 1.563222814168009, + "language_loss": 0.7915535, + "learning_rate": 1.5482343143540467e-06, + "loss": 0.81326175, + "num_input_tokens_seen": 105000020, + "step": 4868, + "time_per_iteration": 2.9283883571624756 + }, + { + "auxiliary_loss_clip": 0.01087268, + "auxiliary_loss_mlp": 0.00872815, + "balance_loss_clip": 1.02447534, + "balance_loss_mlp": 1.00008905, + "epoch": 0.5854626345217339, + "flos": 11983786611840.0, + "grad_norm": 1.8105261243151478, + "language_loss": 0.82584023, + "learning_rate": 1.547475509923295e-06, + "loss": 0.8454411, + "num_input_tokens_seen": 105017060, + "step": 4869, + "time_per_iteration": 3.718637704849243 + }, + { + "auxiliary_loss_clip": 0.01066497, + "auxiliary_loss_mlp": 0.01078903, + "balance_loss_clip": 1.01983893, + "balance_loss_mlp": 0.99993914, + "epoch": 0.585582877412373, + "flos": 64342335173760.0, + "grad_norm": 0.7237006598654446, + "language_loss": 0.55990171, + "learning_rate": 1.5467167741401495e-06, + "loss": 0.58135575, + "num_input_tokens_seen": 105078540, + "step": 4870, + "time_per_iteration": 3.3400330543518066 + }, + { + "auxiliary_loss_clip": 0.01121686, + "auxiliary_loss_mlp": 0.0108416, + "balance_loss_clip": 1.02793014, + "balance_loss_mlp": 1.00371742, + "epoch": 0.5857031203030121, + "flos": 17011926103680.0, + "grad_norm": 2.045307983250781, + "language_loss": 0.7142297, + "learning_rate": 1.5459581071197083e-06, + "loss": 0.73628819, + "num_input_tokens_seen": 105094200, + "step": 4871, + "time_per_iteration": 3.624011993408203 + }, + { + "auxiliary_loss_clip": 0.0112942, + "auxiliary_loss_mlp": 0.0108445, + "balance_loss_clip": 1.02753067, + "balance_loss_mlp": 1.00405538, + "epoch": 0.5858233631936511, + "flos": 20885682303360.0, + "grad_norm": 2.269743958015963, + "language_loss": 0.83070451, + "learning_rate": 1.5451995089770624e-06, + "loss": 0.85284328, + "num_input_tokens_seen": 105113985, + "step": 4872, + "time_per_iteration": 2.7696385383605957 + }, + { + "auxiliary_loss_clip": 0.01136937, + "auxiliary_loss_mlp": 0.0108389, + "balance_loss_clip": 1.02642655, + "balance_loss_mlp": 1.00373411, + "epoch": 0.5859436060842903, + "flos": 23191902000000.0, + "grad_norm": 1.3445454521971594, + "language_loss": 0.71842444, + "learning_rate": 1.5444409798272885e-06, + "loss": 0.74063271, + "num_input_tokens_seen": 105138075, + "step": 4873, + "time_per_iteration": 3.6983277797698975 + }, + { + "auxiliary_loss_clip": 0.01110706, + "auxiliary_loss_mlp": 0.01083827, + "balance_loss_clip": 1.02610159, + "balance_loss_mlp": 1.00352764, + "epoch": 0.5860638489749294, + "flos": 22492648961280.0, + "grad_norm": 1.8673496066750774, + "language_loss": 0.80417323, + "learning_rate": 1.543682519785456e-06, + "loss": 0.82611859, + "num_input_tokens_seen": 105156555, + "step": 4874, + "time_per_iteration": 2.8284480571746826 + }, + { + "auxiliary_loss_clip": 0.01121932, + "auxiliary_loss_mlp": 0.01086541, + "balance_loss_clip": 1.02767897, + "balance_loss_mlp": 1.00633705, + "epoch": 0.5861840918655684, + "flos": 17566243764480.0, + "grad_norm": 3.8909424791019953, + "language_loss": 0.80333996, + "learning_rate": 1.5429241289666219e-06, + "loss": 0.82542473, + "num_input_tokens_seen": 105174055, + "step": 4875, + "time_per_iteration": 2.6907262802124023 + }, + { + "auxiliary_loss_clip": 0.01119151, + "auxiliary_loss_mlp": 0.01084182, + "balance_loss_clip": 1.02593732, + "balance_loss_mlp": 1.00393009, + "epoch": 0.5863043347562076, + "flos": 25556152118400.0, + "grad_norm": 1.8358437670559073, + "language_loss": 0.69863117, + "learning_rate": 1.5421658074858342e-06, + "loss": 0.7206645, + "num_input_tokens_seen": 105192160, + "step": 4876, + "time_per_iteration": 2.7380874156951904 + }, + { + "auxiliary_loss_clip": 0.01120675, + "auxiliary_loss_mlp": 0.01084931, + "balance_loss_clip": 1.02704871, + "balance_loss_mlp": 1.00453639, + "epoch": 0.5864245776468466, + "flos": 20667525050880.0, + "grad_norm": 2.2070677995373402, + "language_loss": 0.66065705, + "learning_rate": 1.5414075554581298e-06, + "loss": 0.68271309, + "num_input_tokens_seen": 105210205, + "step": 4877, + "time_per_iteration": 3.6549997329711914 + }, + { + "auxiliary_loss_clip": 0.01138175, + "auxiliary_loss_mlp": 0.01083509, + "balance_loss_clip": 1.02702403, + "balance_loss_mlp": 1.00320947, + "epoch": 0.5865448205374857, + "flos": 28913907490560.0, + "grad_norm": 2.4230401803719155, + "language_loss": 0.78271377, + "learning_rate": 1.5406493729985348e-06, + "loss": 0.80493063, + "num_input_tokens_seen": 105229400, + "step": 4878, + "time_per_iteration": 2.6475024223327637 + }, + { + "auxiliary_loss_clip": 0.01079471, + "auxiliary_loss_mlp": 0.00872958, + "balance_loss_clip": 1.02572525, + "balance_loss_mlp": 1.00002861, + "epoch": 0.5866650634281249, + "flos": 25842575168640.0, + "grad_norm": 2.0907564870830178, + "language_loss": 0.71927655, + "learning_rate": 1.5398912602220644e-06, + "loss": 0.73880088, + "num_input_tokens_seen": 105248675, + "step": 4879, + "time_per_iteration": 2.9060592651367188 + }, + { + "auxiliary_loss_clip": 0.0108752, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_clip": 1.02646196, + "balance_loss_mlp": 1.00433636, + "epoch": 0.5867853063187639, + "flos": 17052325925760.0, + "grad_norm": 2.0232252101641253, + "language_loss": 0.78417856, + "learning_rate": 1.539133217243724e-06, + "loss": 0.80590057, + "num_input_tokens_seen": 105265695, + "step": 4880, + "time_per_iteration": 2.826493978500366 + }, + { + "auxiliary_loss_clip": 0.01109569, + "auxiliary_loss_mlp": 0.01084541, + "balance_loss_clip": 1.02491117, + "balance_loss_mlp": 1.00405049, + "epoch": 0.586905549209403, + "flos": 24645026707200.0, + "grad_norm": 2.2825251692063837, + "language_loss": 0.76272303, + "learning_rate": 1.5383752441785081e-06, + "loss": 0.78466415, + "num_input_tokens_seen": 105284920, + "step": 4881, + "time_per_iteration": 2.8273189067840576 + }, + { + "auxiliary_loss_clip": 0.01130143, + "auxiliary_loss_mlp": 0.01086609, + "balance_loss_clip": 1.027933, + "balance_loss_mlp": 1.00616693, + "epoch": 0.5870257921000421, + "flos": 14720538723840.0, + "grad_norm": 2.1948816137886213, + "language_loss": 0.85689723, + "learning_rate": 1.5376173411414003e-06, + "loss": 0.8790648, + "num_input_tokens_seen": 105302960, + "step": 4882, + "time_per_iteration": 2.7614760398864746 + }, + { + "auxiliary_loss_clip": 0.01121074, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_clip": 1.02666044, + "balance_loss_mlp": 1.00471067, + "epoch": 0.5871460349906812, + "flos": 23914998691200.0, + "grad_norm": 1.973032058383268, + "language_loss": 0.78754902, + "learning_rate": 1.5368595082473753e-06, + "loss": 0.80960989, + "num_input_tokens_seen": 105321260, + "step": 4883, + "time_per_iteration": 2.743195056915283 + }, + { + "auxiliary_loss_clip": 0.01128645, + "auxiliary_loss_mlp": 0.01084187, + "balance_loss_clip": 1.02644145, + "balance_loss_mlp": 1.0038873, + "epoch": 0.5872662778813202, + "flos": 22164174063360.0, + "grad_norm": 2.2573754779138944, + "language_loss": 0.77929908, + "learning_rate": 1.5361017456113935e-06, + "loss": 0.80142742, + "num_input_tokens_seen": 105341610, + "step": 4884, + "time_per_iteration": 2.701570510864258 + }, + { + "auxiliary_loss_clip": 0.01129009, + "auxiliary_loss_mlp": 0.01085789, + "balance_loss_clip": 1.02668786, + "balance_loss_mlp": 1.00539446, + "epoch": 0.5873865207719594, + "flos": 18441925430400.0, + "grad_norm": 3.0648926614525958, + "language_loss": 0.85572428, + "learning_rate": 1.5353440533484085e-06, + "loss": 0.87787229, + "num_input_tokens_seen": 105360465, + "step": 4885, + "time_per_iteration": 2.6980783939361572 + }, + { + "auxiliary_loss_clip": 0.01119636, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_clip": 1.0269227, + "balance_loss_mlp": 1.00543964, + "epoch": 0.5875067636625985, + "flos": 54015321427200.0, + "grad_norm": 1.6772545781432275, + "language_loss": 0.65659404, + "learning_rate": 1.534586431573361e-06, + "loss": 0.67864776, + "num_input_tokens_seen": 105385405, + "step": 4886, + "time_per_iteration": 3.046384572982788 + }, + { + "auxiliary_loss_clip": 0.01092213, + "auxiliary_loss_mlp": 0.01085833, + "balance_loss_clip": 1.02499151, + "balance_loss_mlp": 1.00539041, + "epoch": 0.5876270065532375, + "flos": 27995707100160.0, + "grad_norm": 2.2127393383343708, + "language_loss": 0.79283577, + "learning_rate": 1.5338288804011817e-06, + "loss": 0.81461626, + "num_input_tokens_seen": 105404905, + "step": 4887, + "time_per_iteration": 2.8691675662994385 + }, + { + "auxiliary_loss_clip": 0.0111921, + "auxiliary_loss_mlp": 0.01084198, + "balance_loss_clip": 1.02479696, + "balance_loss_mlp": 1.00380349, + "epoch": 0.5877472494438767, + "flos": 21361462876800.0, + "grad_norm": 2.0154141072017047, + "language_loss": 0.70947552, + "learning_rate": 1.533071399946791e-06, + "loss": 0.73150957, + "num_input_tokens_seen": 105423650, + "step": 4888, + "time_per_iteration": 2.758145332336426 + }, + { + "auxiliary_loss_clip": 0.01101386, + "auxiliary_loss_mlp": 0.01084565, + "balance_loss_clip": 1.02506995, + "balance_loss_mlp": 1.00426579, + "epoch": 0.5878674923345157, + "flos": 22383013674240.0, + "grad_norm": 1.680368359413698, + "language_loss": 0.57223535, + "learning_rate": 1.5323139903250977e-06, + "loss": 0.59409481, + "num_input_tokens_seen": 105444255, + "step": 4889, + "time_per_iteration": 2.718212604522705 + }, + { + "auxiliary_loss_clip": 0.0111941, + "auxiliary_loss_mlp": 0.01086074, + "balance_loss_clip": 1.02667522, + "balance_loss_mlp": 1.00582242, + "epoch": 0.5879877352251548, + "flos": 21868664872320.0, + "grad_norm": 1.9293437832918299, + "language_loss": 0.76824665, + "learning_rate": 1.5315566516510002e-06, + "loss": 0.7903015, + "num_input_tokens_seen": 105462425, + "step": 4890, + "time_per_iteration": 2.6774826049804688 + }, + { + "auxiliary_loss_clip": 0.01138614, + "auxiliary_loss_mlp": 0.01083947, + "balance_loss_clip": 1.02825069, + "balance_loss_mlp": 1.00364745, + "epoch": 0.5881079781157939, + "flos": 17493811989120.0, + "grad_norm": 1.6926248376666764, + "language_loss": 0.6744234, + "learning_rate": 1.5307993840393857e-06, + "loss": 0.69664896, + "num_input_tokens_seen": 105480505, + "step": 4891, + "time_per_iteration": 2.6610107421875 + }, + { + "auxiliary_loss_clip": 0.01137565, + "auxiliary_loss_mlp": 0.01085197, + "balance_loss_clip": 1.02640057, + "balance_loss_mlp": 1.0049932, + "epoch": 0.588228221006433, + "flos": 22601853285120.0, + "grad_norm": 1.7353226490678908, + "language_loss": 0.80244482, + "learning_rate": 1.530042187605132e-06, + "loss": 0.8246724, + "num_input_tokens_seen": 105499760, + "step": 4892, + "time_per_iteration": 2.620115041732788 + }, + { + "auxiliary_loss_clip": 0.01128794, + "auxiliary_loss_mlp": 0.00872783, + "balance_loss_clip": 1.02669585, + "balance_loss_mlp": 1.0000484, + "epoch": 0.5883484638970721, + "flos": 26176939896960.0, + "grad_norm": 1.412892579764528, + "language_loss": 0.8404727, + "learning_rate": 1.5292850624631044e-06, + "loss": 0.86048841, + "num_input_tokens_seen": 105521955, + "step": 4893, + "time_per_iteration": 2.7704739570617676 + }, + { + "auxiliary_loss_clip": 0.01120585, + "auxiliary_loss_mlp": 0.01085339, + "balance_loss_clip": 1.02521157, + "balance_loss_mlp": 1.00503981, + "epoch": 0.5884687067877111, + "flos": 30443737691520.0, + "grad_norm": 1.8561832088811223, + "language_loss": 0.79757047, + "learning_rate": 1.5285280087281593e-06, + "loss": 0.81962967, + "num_input_tokens_seen": 105542685, + "step": 4894, + "time_per_iteration": 2.775665044784546 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.01079325, + "balance_loss_clip": 1.02029634, + "balance_loss_mlp": 0.99997884, + "epoch": 0.5885889496783503, + "flos": 70507550580480.0, + "grad_norm": 0.6491537531834051, + "language_loss": 0.56660688, + "learning_rate": 1.5277710265151398e-06, + "loss": 0.58839989, + "num_input_tokens_seen": 105612165, + "step": 4895, + "time_per_iteration": 4.3899500370025635 + }, + { + "auxiliary_loss_clip": 0.01129936, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_clip": 1.02682793, + "balance_loss_mlp": 1.00356007, + "epoch": 0.5887091925689893, + "flos": 19098767485440.0, + "grad_norm": 3.4011251152389046, + "language_loss": 0.76863194, + "learning_rate": 1.5270141159388803e-06, + "loss": 0.79077125, + "num_input_tokens_seen": 105629185, + "step": 4896, + "time_per_iteration": 3.5240347385406494 + }, + { + "auxiliary_loss_clip": 0.01136885, + "auxiliary_loss_mlp": 0.01083752, + "balance_loss_clip": 1.02655554, + "balance_loss_mlp": 1.00335705, + "epoch": 0.5888294354596284, + "flos": 23294282739840.0, + "grad_norm": 1.6430909455756635, + "language_loss": 0.80448049, + "learning_rate": 1.526257277114203e-06, + "loss": 0.82668686, + "num_input_tokens_seen": 105650260, + "step": 4897, + "time_per_iteration": 2.654562473297119 + }, + { + "auxiliary_loss_clip": 0.01115992, + "auxiliary_loss_mlp": 0.0108405, + "balance_loss_clip": 1.02458787, + "balance_loss_mlp": 1.00375009, + "epoch": 0.5889496783502676, + "flos": 21981532383360.0, + "grad_norm": 2.3237867837616557, + "language_loss": 0.790636, + "learning_rate": 1.5255005101559201e-06, + "loss": 0.81263649, + "num_input_tokens_seen": 105667870, + "step": 4898, + "time_per_iteration": 2.7089834213256836 + }, + { + "auxiliary_loss_clip": 0.0111325, + "auxiliary_loss_mlp": 0.01084475, + "balance_loss_clip": 1.0271939, + "balance_loss_mlp": 1.00417566, + "epoch": 0.5890699212409066, + "flos": 21685233093120.0, + "grad_norm": 2.390306298000828, + "language_loss": 0.76630431, + "learning_rate": 1.524743815178833e-06, + "loss": 0.78828156, + "num_input_tokens_seen": 105685830, + "step": 4899, + "time_per_iteration": 3.754817485809326 + }, + { + "auxiliary_loss_clip": 0.01117858, + "auxiliary_loss_mlp": 0.01085707, + "balance_loss_clip": 1.02410281, + "balance_loss_mlp": 1.00550318, + "epoch": 0.5891901641315457, + "flos": 19464553635840.0, + "grad_norm": 2.421721461202596, + "language_loss": 0.80893528, + "learning_rate": 1.5239871922977315e-06, + "loss": 0.83097088, + "num_input_tokens_seen": 105705745, + "step": 4900, + "time_per_iteration": 2.7169175148010254 + }, + { + "auxiliary_loss_clip": 0.01122136, + "auxiliary_loss_mlp": 0.01084359, + "balance_loss_clip": 1.02799761, + "balance_loss_mlp": 1.00410771, + "epoch": 0.5893104070221848, + "flos": 19609884063360.0, + "grad_norm": 1.8727313712315783, + "language_loss": 0.89659476, + "learning_rate": 1.523230641627394e-06, + "loss": 0.91865981, + "num_input_tokens_seen": 105724730, + "step": 4901, + "time_per_iteration": 2.799717664718628 + }, + { + "auxiliary_loss_clip": 0.0110146, + "auxiliary_loss_mlp": 0.01084275, + "balance_loss_clip": 1.02481985, + "balance_loss_mlp": 1.00397587, + "epoch": 0.5894306499128239, + "flos": 29060063930880.0, + "grad_norm": 2.748492708130809, + "language_loss": 0.72705019, + "learning_rate": 1.5224741632825888e-06, + "loss": 0.74890757, + "num_input_tokens_seen": 105744920, + "step": 4902, + "time_per_iteration": 2.8301148414611816 + }, + { + "auxiliary_loss_clip": 0.01138854, + "auxiliary_loss_mlp": 0.01087732, + "balance_loss_clip": 1.02827239, + "balance_loss_mlp": 1.00728941, + "epoch": 0.589550892803463, + "flos": 42298890721920.0, + "grad_norm": 2.572055165388447, + "language_loss": 0.69335818, + "learning_rate": 1.521717757378074e-06, + "loss": 0.71562409, + "num_input_tokens_seen": 105765465, + "step": 4903, + "time_per_iteration": 3.68232798576355 + }, + { + "auxiliary_loss_clip": 0.01129087, + "auxiliary_loss_mlp": 0.0108405, + "balance_loss_clip": 1.02615118, + "balance_loss_mlp": 1.00379789, + "epoch": 0.5896711356941021, + "flos": 14137062197760.0, + "grad_norm": 1.9011992707970613, + "language_loss": 0.69449699, + "learning_rate": 1.5209614240285943e-06, + "loss": 0.71662837, + "num_input_tokens_seen": 105783120, + "step": 4904, + "time_per_iteration": 2.6979687213897705 + }, + { + "auxiliary_loss_clip": 0.01136825, + "auxiliary_loss_mlp": 0.00872818, + "balance_loss_clip": 1.02658153, + "balance_loss_mlp": 1.0000453, + "epoch": 0.5897913785847412, + "flos": 17201355454080.0, + "grad_norm": 1.9092352927417295, + "language_loss": 0.84663594, + "learning_rate": 1.520205163348887e-06, + "loss": 0.86673236, + "num_input_tokens_seen": 105801055, + "step": 4905, + "time_per_iteration": 2.5712828636169434 + }, + { + "auxiliary_loss_clip": 0.0109273, + "auxiliary_loss_mlp": 0.01079207, + "balance_loss_clip": 1.02063608, + "balance_loss_mlp": 1.00024247, + "epoch": 0.5899116214753802, + "flos": 48794164202880.0, + "grad_norm": 1.3102130316387404, + "language_loss": 0.56936353, + "learning_rate": 1.519448975453674e-06, + "loss": 0.59108287, + "num_input_tokens_seen": 105856155, + "step": 4906, + "time_per_iteration": 3.259784698486328 + }, + { + "auxiliary_loss_clip": 0.01128034, + "auxiliary_loss_mlp": 0.00872881, + "balance_loss_clip": 1.02707708, + "balance_loss_mlp": 1.00002587, + "epoch": 0.5900318643660194, + "flos": 21103659987840.0, + "grad_norm": 2.1510157149275533, + "language_loss": 0.75988305, + "learning_rate": 1.5186928604576696e-06, + "loss": 0.77989221, + "num_input_tokens_seen": 105873350, + "step": 4907, + "time_per_iteration": 2.627476453781128 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_clip": 1.02732253, + "balance_loss_mlp": 1.00451493, + "epoch": 0.5901521072566585, + "flos": 21178390233600.0, + "grad_norm": 2.3104481165974744, + "language_loss": 0.76906973, + "learning_rate": 1.5179368184755752e-06, + "loss": 0.79112583, + "num_input_tokens_seen": 105891435, + "step": 4908, + "time_per_iteration": 2.684574604034424 + }, + { + "auxiliary_loss_clip": 0.0111824, + "auxiliary_loss_mlp": 0.0108414, + "balance_loss_clip": 1.02549577, + "balance_loss_mlp": 1.00388789, + "epoch": 0.5902723501472975, + "flos": 20225967160320.0, + "grad_norm": 1.4810698008592538, + "language_loss": 0.82448298, + "learning_rate": 1.5171808496220821e-06, + "loss": 0.84650671, + "num_input_tokens_seen": 105910190, + "step": 4909, + "time_per_iteration": 2.6138901710510254 + }, + { + "auxiliary_loss_clip": 0.01119866, + "auxiliary_loss_mlp": 0.01085601, + "balance_loss_clip": 1.02583098, + "balance_loss_mlp": 1.00530195, + "epoch": 0.5903925930379367, + "flos": 22964407211520.0, + "grad_norm": 1.807101695064102, + "language_loss": 0.8150022, + "learning_rate": 1.5164249540118708e-06, + "loss": 0.83705688, + "num_input_tokens_seen": 105929315, + "step": 4910, + "time_per_iteration": 2.6561338901519775 + }, + { + "auxiliary_loss_clip": 0.01092516, + "auxiliary_loss_mlp": 0.0108449, + "balance_loss_clip": 1.02475679, + "balance_loss_mlp": 1.00428617, + "epoch": 0.5905128359285757, + "flos": 23367720096000.0, + "grad_norm": 1.6062827604154435, + "language_loss": 0.82977176, + "learning_rate": 1.5156691317596093e-06, + "loss": 0.85154188, + "num_input_tokens_seen": 105950740, + "step": 4911, + "time_per_iteration": 2.719876766204834 + }, + { + "auxiliary_loss_clip": 0.0113086, + "auxiliary_loss_mlp": 0.00872796, + "balance_loss_clip": 1.02811813, + "balance_loss_mlp": 1.00012898, + "epoch": 0.5906330788192148, + "flos": 28032335994240.0, + "grad_norm": 2.0217439306199836, + "language_loss": 0.66441846, + "learning_rate": 1.5149133829799556e-06, + "loss": 0.68445504, + "num_input_tokens_seen": 105968735, + "step": 4912, + "time_per_iteration": 2.687401056289673 + }, + { + "auxiliary_loss_clip": 0.01105892, + "auxiliary_loss_mlp": 0.01085076, + "balance_loss_clip": 1.02779114, + "balance_loss_mlp": 1.00472927, + "epoch": 0.590753321709854, + "flos": 18477943793280.0, + "grad_norm": 1.8018226048329786, + "language_loss": 0.80814564, + "learning_rate": 1.5141577077875556e-06, + "loss": 0.8300553, + "num_input_tokens_seen": 105986060, + "step": 4913, + "time_per_iteration": 2.7531979084014893 + }, + { + "auxiliary_loss_clip": 0.01130012, + "auxiliary_loss_mlp": 0.01085282, + "balance_loss_clip": 1.02761376, + "balance_loss_mlp": 1.00493491, + "epoch": 0.590873564600493, + "flos": 16873706568960.0, + "grad_norm": 2.2101364384314657, + "language_loss": 0.72156113, + "learning_rate": 1.5134021062970451e-06, + "loss": 0.74371409, + "num_input_tokens_seen": 106004440, + "step": 4914, + "time_per_iteration": 2.6430022716522217 + }, + { + "auxiliary_loss_clip": 0.01104766, + "auxiliary_loss_mlp": 0.01085109, + "balance_loss_clip": 1.022192, + "balance_loss_mlp": 1.00485778, + "epoch": 0.5909938074911321, + "flos": 13516166678400.0, + "grad_norm": 2.052244475302499, + "language_loss": 0.80641735, + "learning_rate": 1.5126465786230483e-06, + "loss": 0.82831609, + "num_input_tokens_seen": 106021215, + "step": 4915, + "time_per_iteration": 2.8185975551605225 + }, + { + "auxiliary_loss_clip": 0.01136759, + "auxiliary_loss_mlp": 0.01084681, + "balance_loss_clip": 1.02654576, + "balance_loss_mlp": 1.00433421, + "epoch": 0.5911140503817712, + "flos": 26024067613440.0, + "grad_norm": 1.9938097420437721, + "language_loss": 0.82195109, + "learning_rate": 1.5118911248801787e-06, + "loss": 0.8441655, + "num_input_tokens_seen": 106039225, + "step": 4916, + "time_per_iteration": 2.6511640548706055 + }, + { + "auxiliary_loss_clip": 0.01126507, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_clip": 1.02579486, + "balance_loss_mlp": 1.00405288, + "epoch": 0.5912342932724103, + "flos": 23258731253760.0, + "grad_norm": 2.1189994601446513, + "language_loss": 0.79909372, + "learning_rate": 1.5111357451830364e-06, + "loss": 0.82120144, + "num_input_tokens_seen": 106057920, + "step": 4917, + "time_per_iteration": 2.6659035682678223 + }, + { + "auxiliary_loss_clip": 0.01126399, + "auxiliary_loss_mlp": 0.0108393, + "balance_loss_clip": 1.02486527, + "balance_loss_mlp": 1.00372612, + "epoch": 0.5913545361630493, + "flos": 19573039687680.0, + "grad_norm": 1.6904294186535012, + "language_loss": 0.71007574, + "learning_rate": 1.5103804396462131e-06, + "loss": 0.73217905, + "num_input_tokens_seen": 106077855, + "step": 4918, + "time_per_iteration": 2.6107356548309326 + }, + { + "auxiliary_loss_clip": 0.01129467, + "auxiliary_loss_mlp": 0.01085319, + "balance_loss_clip": 1.02657819, + "balance_loss_mlp": 1.00487626, + "epoch": 0.5914747790536885, + "flos": 26213532877440.0, + "grad_norm": 2.3684424593937576, + "language_loss": 0.79947412, + "learning_rate": 1.5096252083842877e-06, + "loss": 0.82162201, + "num_input_tokens_seen": 106097065, + "step": 4919, + "time_per_iteration": 2.69695782661438 + }, + { + "auxiliary_loss_clip": 0.0112915, + "auxiliary_loss_mlp": 0.01085571, + "balance_loss_clip": 1.02665591, + "balance_loss_mlp": 1.00512874, + "epoch": 0.5915950219443276, + "flos": 27417545786880.0, + "grad_norm": 1.846730117035271, + "language_loss": 0.8533113, + "learning_rate": 1.5088700515118285e-06, + "loss": 0.87545848, + "num_input_tokens_seen": 106116385, + "step": 4920, + "time_per_iteration": 3.6258368492126465 + }, + { + "auxiliary_loss_clip": 0.01109582, + "auxiliary_loss_mlp": 0.01086195, + "balance_loss_clip": 1.02499163, + "balance_loss_mlp": 1.00584793, + "epoch": 0.5917152648349666, + "flos": 21907879545600.0, + "grad_norm": 2.9192233486069274, + "language_loss": 0.66751933, + "learning_rate": 1.508114969143392e-06, + "loss": 0.68947709, + "num_input_tokens_seen": 106136370, + "step": 4921, + "time_per_iteration": 3.650932788848877 + }, + { + "auxiliary_loss_clip": 0.01119149, + "auxiliary_loss_mlp": 0.0108353, + "balance_loss_clip": 1.02524924, + "balance_loss_mlp": 1.00342107, + "epoch": 0.5918355077256057, + "flos": 28109185142400.0, + "grad_norm": 2.5506520167135958, + "language_loss": 0.77425885, + "learning_rate": 1.5073599613935238e-06, + "loss": 0.79628563, + "num_input_tokens_seen": 106158490, + "step": 4922, + "time_per_iteration": 2.7817344665527344 + }, + { + "auxiliary_loss_clip": 0.01119088, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_clip": 1.02680039, + "balance_loss_mlp": 1.00356483, + "epoch": 0.5919557506162448, + "flos": 28183807647360.0, + "grad_norm": 1.9584034650894147, + "language_loss": 0.57210505, + "learning_rate": 1.5066050283767574e-06, + "loss": 0.59413505, + "num_input_tokens_seen": 106179170, + "step": 4923, + "time_per_iteration": 3.683147668838501 + }, + { + "auxiliary_loss_clip": 0.01106159, + "auxiliary_loss_mlp": 0.01084578, + "balance_loss_clip": 1.02810407, + "balance_loss_mlp": 1.00437355, + "epoch": 0.5920759935068839, + "flos": 12094355652480.0, + "grad_norm": 1.825971027622677, + "language_loss": 0.82896125, + "learning_rate": 1.505850170207616e-06, + "loss": 0.85086864, + "num_input_tokens_seen": 106196035, + "step": 4924, + "time_per_iteration": 2.7444441318511963 + }, + { + "auxiliary_loss_clip": 0.01118389, + "auxiliary_loss_mlp": 0.01084732, + "balance_loss_clip": 1.02526879, + "balance_loss_mlp": 1.00438547, + "epoch": 0.592196236397523, + "flos": 29424772673280.0, + "grad_norm": 2.3495651026092794, + "language_loss": 0.78309774, + "learning_rate": 1.505095387000611e-06, + "loss": 0.80512893, + "num_input_tokens_seen": 106218335, + "step": 4925, + "time_per_iteration": 2.8067808151245117 + }, + { + "auxiliary_loss_clip": 0.01120112, + "auxiliary_loss_mlp": 0.01085403, + "balance_loss_clip": 1.02694535, + "balance_loss_mlp": 1.00515139, + "epoch": 0.5923164792881621, + "flos": 24384709866240.0, + "grad_norm": 2.7465844322328046, + "language_loss": 0.74352705, + "learning_rate": 1.504340678870242e-06, + "loss": 0.7655822, + "num_input_tokens_seen": 106236550, + "step": 4926, + "time_per_iteration": 2.722306728363037 + }, + { + "auxiliary_loss_clip": 0.01127642, + "auxiliary_loss_mlp": 0.0108584, + "balance_loss_clip": 1.02633131, + "balance_loss_mlp": 1.00539768, + "epoch": 0.5924367221788012, + "flos": 24024238928640.0, + "grad_norm": 2.5732824604575577, + "language_loss": 0.89879555, + "learning_rate": 1.5035860459309989e-06, + "loss": 0.92093039, + "num_input_tokens_seen": 106254265, + "step": 4927, + "time_per_iteration": 3.6078574657440186 + }, + { + "auxiliary_loss_clip": 0.01118121, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_clip": 1.02485275, + "balance_loss_mlp": 1.00524914, + "epoch": 0.5925569650694402, + "flos": 26870590414080.0, + "grad_norm": 2.1090138103650933, + "language_loss": 0.63582146, + "learning_rate": 1.5028314882973568e-06, + "loss": 0.65786004, + "num_input_tokens_seen": 106274670, + "step": 4928, + "time_per_iteration": 2.729449510574341 + }, + { + "auxiliary_loss_clip": 0.01117997, + "auxiliary_loss_mlp": 0.01084844, + "balance_loss_clip": 1.02462327, + "balance_loss_mlp": 1.00454497, + "epoch": 0.5926772079600794, + "flos": 22302788647680.0, + "grad_norm": 1.8860364350579675, + "language_loss": 0.84523153, + "learning_rate": 1.502077006083783e-06, + "loss": 0.86725992, + "num_input_tokens_seen": 106293330, + "step": 4929, + "time_per_iteration": 2.7365424633026123 + }, + { + "auxiliary_loss_clip": 0.01114142, + "auxiliary_loss_mlp": 0.00872848, + "balance_loss_clip": 1.02815938, + "balance_loss_mlp": 1.00005639, + "epoch": 0.5927974508507184, + "flos": 19865244827520.0, + "grad_norm": 1.8794597093012935, + "language_loss": 0.76496232, + "learning_rate": 1.5013225994047315e-06, + "loss": 0.78483224, + "num_input_tokens_seen": 106310960, + "step": 4930, + "time_per_iteration": 2.7045910358428955 + }, + { + "auxiliary_loss_clip": 0.01128113, + "auxiliary_loss_mlp": 0.00872863, + "balance_loss_clip": 1.02620816, + "balance_loss_mlp": 1.00006628, + "epoch": 0.5929176937413575, + "flos": 15776743167360.0, + "grad_norm": 1.605270456049272, + "language_loss": 0.80823237, + "learning_rate": 1.5005682683746452e-06, + "loss": 0.82824212, + "num_input_tokens_seen": 106329475, + "step": 4931, + "time_per_iteration": 2.6849584579467773 + }, + { + "auxiliary_loss_clip": 0.01128615, + "auxiliary_loss_mlp": 0.01086705, + "balance_loss_clip": 1.02730858, + "balance_loss_mlp": 1.00640583, + "epoch": 0.5930379366319967, + "flos": 17601472028160.0, + "grad_norm": 2.394151711142598, + "language_loss": 0.72679424, + "learning_rate": 1.4998140131079553e-06, + "loss": 0.74894744, + "num_input_tokens_seen": 106345565, + "step": 4932, + "time_per_iteration": 2.680950403213501 + }, + { + "auxiliary_loss_clip": 0.01082855, + "auxiliary_loss_mlp": 0.00872893, + "balance_loss_clip": 1.02271771, + "balance_loss_mlp": 1.00003088, + "epoch": 0.5931581795226357, + "flos": 17704283731200.0, + "grad_norm": 1.8930103508195022, + "language_loss": 0.73412299, + "learning_rate": 1.4990598337190821e-06, + "loss": 0.75368047, + "num_input_tokens_seen": 106361920, + "step": 4933, + "time_per_iteration": 2.7910995483398438 + }, + { + "auxiliary_loss_clip": 0.01138786, + "auxiliary_loss_mlp": 0.00872942, + "balance_loss_clip": 1.0280931, + "balance_loss_mlp": 1.00003743, + "epoch": 0.5932784224132748, + "flos": 24280102483200.0, + "grad_norm": 1.8053668891834533, + "language_loss": 0.67735255, + "learning_rate": 1.4983057303224338e-06, + "loss": 0.69746989, + "num_input_tokens_seen": 106381735, + "step": 4934, + "time_per_iteration": 2.6955974102020264 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01084096, + "balance_loss_clip": 1.02453351, + "balance_loss_mlp": 1.00370145, + "epoch": 0.5933986653039139, + "flos": 22926700909440.0, + "grad_norm": 1.719688304380859, + "language_loss": 0.8741827, + "learning_rate": 1.4975517030324072e-06, + "loss": 0.89602369, + "num_input_tokens_seen": 106399745, + "step": 4935, + "time_per_iteration": 2.8137545585632324 + }, + { + "auxiliary_loss_clip": 0.01116615, + "auxiliary_loss_mlp": 0.0087317, + "balance_loss_clip": 1.02081704, + "balance_loss_mlp": 1.00171709, + "epoch": 0.593518908194553, + "flos": 71121730256640.0, + "grad_norm": 0.7831556669051254, + "language_loss": 0.61838043, + "learning_rate": 1.4967977519633882e-06, + "loss": 0.63827837, + "num_input_tokens_seen": 106457205, + "step": 4936, + "time_per_iteration": 3.349548816680908 + }, + { + "auxiliary_loss_clip": 0.01107482, + "auxiliary_loss_mlp": 0.01086742, + "balance_loss_clip": 1.02382898, + "balance_loss_mlp": 1.00644255, + "epoch": 0.593639151085192, + "flos": 20448649526400.0, + "grad_norm": 1.8200839443813812, + "language_loss": 0.78110576, + "learning_rate": 1.4960438772297494e-06, + "loss": 0.8030479, + "num_input_tokens_seen": 106474250, + "step": 4937, + "time_per_iteration": 2.7323522567749023 + }, + { + "auxiliary_loss_clip": 0.01120906, + "auxiliary_loss_mlp": 0.01084563, + "balance_loss_clip": 1.02597702, + "balance_loss_mlp": 1.00416803, + "epoch": 0.5937593939758312, + "flos": 30883428074880.0, + "grad_norm": 1.9844049040238287, + "language_loss": 0.7355814, + "learning_rate": 1.495290078945855e-06, + "loss": 0.75763607, + "num_input_tokens_seen": 106494015, + "step": 4938, + "time_per_iteration": 2.7719810009002686 + }, + { + "auxiliary_loss_clip": 0.0113752, + "auxiliary_loss_mlp": 0.01085016, + "balance_loss_clip": 1.02760601, + "balance_loss_mlp": 1.00466859, + "epoch": 0.5938796368664703, + "flos": 36898069668480.0, + "grad_norm": 1.9700299685802514, + "language_loss": 0.74111474, + "learning_rate": 1.4945363572260529e-06, + "loss": 0.76334, + "num_input_tokens_seen": 106515010, + "step": 4939, + "time_per_iteration": 2.7441489696502686 + }, + { + "auxiliary_loss_clip": 0.01127084, + "auxiliary_loss_mlp": 0.0108511, + "balance_loss_clip": 1.02564645, + "balance_loss_mlp": 1.00476313, + "epoch": 0.5939998797571093, + "flos": 23842926051840.0, + "grad_norm": 2.0511124005166246, + "language_loss": 0.67974073, + "learning_rate": 1.4937827121846845e-06, + "loss": 0.70186269, + "num_input_tokens_seen": 106535265, + "step": 4940, + "time_per_iteration": 2.725787878036499 + }, + { + "auxiliary_loss_clip": 0.01104111, + "auxiliary_loss_mlp": 0.01085988, + "balance_loss_clip": 1.02230489, + "balance_loss_mlp": 1.00568902, + "epoch": 0.5941201226477485, + "flos": 25191407462400.0, + "grad_norm": 1.4478207461365014, + "language_loss": 0.73457199, + "learning_rate": 1.4930291439360755e-06, + "loss": 0.75647295, + "num_input_tokens_seen": 106557830, + "step": 4941, + "time_per_iteration": 2.8465631008148193 + }, + { + "auxiliary_loss_clip": 0.01126807, + "auxiliary_loss_mlp": 0.01086095, + "balance_loss_clip": 1.02554107, + "balance_loss_mlp": 1.00570011, + "epoch": 0.5942403655383875, + "flos": 22418996123520.0, + "grad_norm": 1.7411339962321657, + "language_loss": 0.79259741, + "learning_rate": 1.4922756525945427e-06, + "loss": 0.81472635, + "num_input_tokens_seen": 106577140, + "step": 4942, + "time_per_iteration": 2.685659408569336 + }, + { + "auxiliary_loss_clip": 0.01092287, + "auxiliary_loss_mlp": 0.01079829, + "balance_loss_clip": 1.02131653, + "balance_loss_mlp": 1.00048351, + "epoch": 0.5943606084290266, + "flos": 67629310796160.0, + "grad_norm": 0.7713045473297231, + "language_loss": 0.59590298, + "learning_rate": 1.4915222382743894e-06, + "loss": 0.6176241, + "num_input_tokens_seen": 106635975, + "step": 4943, + "time_per_iteration": 3.3054332733154297 + }, + { + "auxiliary_loss_clip": 0.01127603, + "auxiliary_loss_mlp": 0.01084799, + "balance_loss_clip": 1.02680457, + "balance_loss_mlp": 1.00445175, + "epoch": 0.5944808513196658, + "flos": 18223157646720.0, + "grad_norm": 2.1625580186274553, + "language_loss": 0.72256017, + "learning_rate": 1.4907689010899085e-06, + "loss": 0.74468422, + "num_input_tokens_seen": 106653555, + "step": 4944, + "time_per_iteration": 2.6083121299743652 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01084472, + "balance_loss_clip": 1.02355814, + "balance_loss_mlp": 1.00417256, + "epoch": 0.5946010942103048, + "flos": 24790824011520.0, + "grad_norm": 2.0367698207087375, + "language_loss": 0.62494922, + "learning_rate": 1.4900156411553804e-06, + "loss": 0.64695263, + "num_input_tokens_seen": 106673385, + "step": 4945, + "time_per_iteration": 2.759429454803467 + }, + { + "auxiliary_loss_clip": 0.01117947, + "auxiliary_loss_mlp": 0.01084242, + "balance_loss_clip": 1.02543259, + "balance_loss_mlp": 1.00399017, + "epoch": 0.5947213371009439, + "flos": 15231619388160.0, + "grad_norm": 1.8730206609782405, + "language_loss": 0.85612667, + "learning_rate": 1.4892624585850739e-06, + "loss": 0.87814856, + "num_input_tokens_seen": 106691740, + "step": 4946, + "time_per_iteration": 3.680619955062866 + }, + { + "auxiliary_loss_clip": 0.01137249, + "auxiliary_loss_mlp": 0.01084211, + "balance_loss_clip": 1.02670193, + "balance_loss_mlp": 1.00372124, + "epoch": 0.594841579991583, + "flos": 25848069949440.0, + "grad_norm": 1.873531374929802, + "language_loss": 0.79841787, + "learning_rate": 1.4885093534932465e-06, + "loss": 0.82063252, + "num_input_tokens_seen": 106709705, + "step": 4947, + "time_per_iteration": 3.517805576324463 + }, + { + "auxiliary_loss_clip": 0.01115985, + "auxiliary_loss_mlp": 0.01084471, + "balance_loss_clip": 1.02478325, + "balance_loss_mlp": 1.00417185, + "epoch": 0.5949618228822221, + "flos": 23981109672960.0, + "grad_norm": 1.915092496098003, + "language_loss": 0.7127229, + "learning_rate": 1.4877563259941433e-06, + "loss": 0.7347275, + "num_input_tokens_seen": 106727560, + "step": 4948, + "time_per_iteration": 2.715579032897949 + }, + { + "auxiliary_loss_clip": 0.01129843, + "auxiliary_loss_mlp": 0.01086091, + "balance_loss_clip": 1.02733064, + "balance_loss_mlp": 1.00569618, + "epoch": 0.5950820657728612, + "flos": 40547491476480.0, + "grad_norm": 2.368288195437131, + "language_loss": 0.67715496, + "learning_rate": 1.4870033762019988e-06, + "loss": 0.69931436, + "num_input_tokens_seen": 106747725, + "step": 4949, + "time_per_iteration": 3.7655816078186035 + }, + { + "auxiliary_loss_clip": 0.01120131, + "auxiliary_loss_mlp": 0.01086354, + "balance_loss_clip": 1.0260148, + "balance_loss_mlp": 1.00600696, + "epoch": 0.5952023086635003, + "flos": 23184467884800.0, + "grad_norm": 1.8702514662732102, + "language_loss": 0.73278493, + "learning_rate": 1.4862505042310334e-06, + "loss": 0.75484979, + "num_input_tokens_seen": 106767010, + "step": 4950, + "time_per_iteration": 2.7064266204833984 + }, + { + "auxiliary_loss_clip": 0.01119039, + "auxiliary_loss_mlp": 0.01084201, + "balance_loss_clip": 1.02651393, + "balance_loss_mlp": 1.00390148, + "epoch": 0.5953225515541394, + "flos": 33653289548160.0, + "grad_norm": 1.5338886840092423, + "language_loss": 0.69710302, + "learning_rate": 1.4854977101954587e-06, + "loss": 0.7191354, + "num_input_tokens_seen": 106789230, + "step": 4951, + "time_per_iteration": 2.8570914268493652 + }, + { + "auxiliary_loss_clip": 0.01127785, + "auxiliary_loss_mlp": 0.01084034, + "balance_loss_clip": 1.02544498, + "balance_loss_mlp": 1.00363958, + "epoch": 0.5954427944447784, + "flos": 24459619680000.0, + "grad_norm": 1.9883627039056315, + "language_loss": 0.85638607, + "learning_rate": 1.4847449942094716e-06, + "loss": 0.87850428, + "num_input_tokens_seen": 106808110, + "step": 4952, + "time_per_iteration": 3.5580525398254395 + }, + { + "auxiliary_loss_clip": 0.01114545, + "auxiliary_loss_mlp": 0.01084493, + "balance_loss_clip": 1.02359104, + "balance_loss_mlp": 1.00424135, + "epoch": 0.5955630373354175, + "flos": 18551848026240.0, + "grad_norm": 1.862709155486809, + "language_loss": 0.86474103, + "learning_rate": 1.4839923563872598e-06, + "loss": 0.88673139, + "num_input_tokens_seen": 106826650, + "step": 4953, + "time_per_iteration": 2.7181966304779053 + }, + { + "auxiliary_loss_clip": 0.01102147, + "auxiliary_loss_mlp": 0.01087091, + "balance_loss_clip": 1.0204953, + "balance_loss_mlp": 1.00664902, + "epoch": 0.5956832802260567, + "flos": 19791699730560.0, + "grad_norm": 1.5864501945782423, + "language_loss": 0.75974214, + "learning_rate": 1.483239796842997e-06, + "loss": 0.78163457, + "num_input_tokens_seen": 106844680, + "step": 4954, + "time_per_iteration": 2.761540651321411 + }, + { + "auxiliary_loss_clip": 0.01091752, + "auxiliary_loss_mlp": 0.01084896, + "balance_loss_clip": 1.02562976, + "balance_loss_mlp": 1.0046916, + "epoch": 0.5958035231166957, + "flos": 19750868945280.0, + "grad_norm": 1.795839153869018, + "language_loss": 0.83919168, + "learning_rate": 1.4824873156908462e-06, + "loss": 0.8609581, + "num_input_tokens_seen": 106862605, + "step": 4955, + "time_per_iteration": 2.701845407485962 + }, + { + "auxiliary_loss_clip": 0.01127852, + "auxiliary_loss_mlp": 0.00873076, + "balance_loss_clip": 1.0269866, + "balance_loss_mlp": 1.00006068, + "epoch": 0.5959237660073348, + "flos": 21652806090240.0, + "grad_norm": 1.6429748554178674, + "language_loss": 0.75475544, + "learning_rate": 1.4817349130449584e-06, + "loss": 0.77476472, + "num_input_tokens_seen": 106882325, + "step": 4956, + "time_per_iteration": 2.7409610748291016 + }, + { + "auxiliary_loss_clip": 0.01125703, + "auxiliary_loss_mlp": 0.01084435, + "balance_loss_clip": 1.02485204, + "balance_loss_mlp": 1.00418329, + "epoch": 0.5960440088979739, + "flos": 21171207513600.0, + "grad_norm": 2.3546035848146976, + "language_loss": 0.8267324, + "learning_rate": 1.4809825890194717e-06, + "loss": 0.8488338, + "num_input_tokens_seen": 106900995, + "step": 4957, + "time_per_iteration": 2.7009119987487793 + }, + { + "auxiliary_loss_clip": 0.01120257, + "auxiliary_loss_mlp": 0.01084523, + "balance_loss_clip": 1.02640939, + "balance_loss_mlp": 1.00427175, + "epoch": 0.596164251788613, + "flos": 14757526753920.0, + "grad_norm": 1.7420208399098975, + "language_loss": 0.77212995, + "learning_rate": 1.4802303437285139e-06, + "loss": 0.79417777, + "num_input_tokens_seen": 106918265, + "step": 4958, + "time_per_iteration": 2.7268829345703125 + }, + { + "auxiliary_loss_clip": 0.01116321, + "auxiliary_loss_mlp": 0.01084068, + "balance_loss_clip": 1.02372432, + "balance_loss_mlp": 1.003721, + "epoch": 0.596284494679252, + "flos": 20485924865280.0, + "grad_norm": 2.275497749057141, + "language_loss": 0.80391192, + "learning_rate": 1.4794781772861994e-06, + "loss": 0.82591581, + "num_input_tokens_seen": 106934760, + "step": 4959, + "time_per_iteration": 2.700208902359009 + }, + { + "auxiliary_loss_clip": 0.01119583, + "auxiliary_loss_mlp": 0.00872919, + "balance_loss_clip": 1.02625895, + "balance_loss_mlp": 1.00006926, + "epoch": 0.5964047375698912, + "flos": 31212262108800.0, + "grad_norm": 2.0488471122959284, + "language_loss": 0.669137, + "learning_rate": 1.4787260898066324e-06, + "loss": 0.689062, + "num_input_tokens_seen": 106954760, + "step": 4960, + "time_per_iteration": 2.7955193519592285 + }, + { + "auxiliary_loss_clip": 0.01135716, + "auxiliary_loss_mlp": 0.01084538, + "balance_loss_clip": 1.02610052, + "balance_loss_mlp": 1.00428677, + "epoch": 0.5965249804605303, + "flos": 27483620855040.0, + "grad_norm": 1.9422048481286198, + "language_loss": 0.86208647, + "learning_rate": 1.4779740814039023e-06, + "loss": 0.88428903, + "num_input_tokens_seen": 106974845, + "step": 4961, + "time_per_iteration": 2.6549324989318848 + }, + { + "auxiliary_loss_clip": 0.01135966, + "auxiliary_loss_mlp": 0.01083128, + "balance_loss_clip": 1.02540362, + "balance_loss_mlp": 1.00278139, + "epoch": 0.5966452233511693, + "flos": 30773936442240.0, + "grad_norm": 2.0077969143391083, + "language_loss": 0.68622082, + "learning_rate": 1.4772221521920894e-06, + "loss": 0.70841175, + "num_input_tokens_seen": 106994870, + "step": 4962, + "time_per_iteration": 2.7154908180236816 + }, + { + "auxiliary_loss_clip": 0.0111326, + "auxiliary_loss_mlp": 0.01085475, + "balance_loss_clip": 1.02208757, + "balance_loss_mlp": 1.00512815, + "epoch": 0.5967654662418085, + "flos": 25481170477440.0, + "grad_norm": 2.5368990553654047, + "language_loss": 0.7431289, + "learning_rate": 1.4764703022852598e-06, + "loss": 0.76511621, + "num_input_tokens_seen": 107015390, + "step": 4963, + "time_per_iteration": 2.739408016204834 + }, + { + "auxiliary_loss_clip": 0.01081069, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_clip": 1.02133453, + "balance_loss_mlp": 1.00329578, + "epoch": 0.5968857091324475, + "flos": 19099126621440.0, + "grad_norm": 2.0995543099112806, + "language_loss": 0.77050024, + "learning_rate": 1.4757185317974696e-06, + "loss": 0.79214585, + "num_input_tokens_seen": 107033775, + "step": 4964, + "time_per_iteration": 2.7759616374969482 + }, + { + "auxiliary_loss_clip": 0.01126147, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_clip": 1.02444816, + "balance_loss_mlp": 1.00401902, + "epoch": 0.5970059520230866, + "flos": 23692711374720.0, + "grad_norm": 2.2993871205845284, + "language_loss": 0.71494937, + "learning_rate": 1.474966840842761e-06, + "loss": 0.73705542, + "num_input_tokens_seen": 107053355, + "step": 4965, + "time_per_iteration": 2.7246456146240234 + }, + { + "auxiliary_loss_clip": 0.01128618, + "auxiliary_loss_mlp": 0.01084737, + "balance_loss_clip": 1.02627897, + "balance_loss_mlp": 1.00453281, + "epoch": 0.5971261949137258, + "flos": 23185545292800.0, + "grad_norm": 1.8171457378726537, + "language_loss": 0.86839616, + "learning_rate": 1.4742152295351655e-06, + "loss": 0.89052963, + "num_input_tokens_seen": 107072510, + "step": 4966, + "time_per_iteration": 2.7074620723724365 + }, + { + "auxiliary_loss_clip": 0.01125965, + "auxiliary_loss_mlp": 0.00872968, + "balance_loss_clip": 1.02504253, + "balance_loss_mlp": 1.00006127, + "epoch": 0.5972464378043648, + "flos": 20557710195840.0, + "grad_norm": 2.3216568029352707, + "language_loss": 0.6362617, + "learning_rate": 1.4734636979887016e-06, + "loss": 0.65625107, + "num_input_tokens_seen": 107089970, + "step": 4967, + "time_per_iteration": 2.724472761154175 + }, + { + "auxiliary_loss_clip": 0.01109533, + "auxiliary_loss_mlp": 0.01084249, + "balance_loss_clip": 1.02459264, + "balance_loss_mlp": 1.00390232, + "epoch": 0.5973666806950039, + "flos": 29387030457600.0, + "grad_norm": 2.7836784325572026, + "language_loss": 0.89903134, + "learning_rate": 1.4727122463173755e-06, + "loss": 0.92096919, + "num_input_tokens_seen": 107108500, + "step": 4968, + "time_per_iteration": 2.7818105220794678 + }, + { + "auxiliary_loss_clip": 0.01115973, + "auxiliary_loss_mlp": 0.01085065, + "balance_loss_clip": 1.0239284, + "balance_loss_mlp": 1.00481308, + "epoch": 0.597486923585643, + "flos": 22273522041600.0, + "grad_norm": 1.8399915402698173, + "language_loss": 0.64167821, + "learning_rate": 1.471960874635183e-06, + "loss": 0.66368854, + "num_input_tokens_seen": 107128060, + "step": 4969, + "time_per_iteration": 2.7517683506011963 + }, + { + "auxiliary_loss_clip": 0.01117705, + "auxiliary_loss_mlp": 0.01084141, + "balance_loss_clip": 1.02459598, + "balance_loss_mlp": 1.003842, + "epoch": 0.5976071664762821, + "flos": 13772461196160.0, + "grad_norm": 2.1837902964907734, + "language_loss": 0.70896733, + "learning_rate": 1.4712095830561055e-06, + "loss": 0.73098576, + "num_input_tokens_seen": 107146550, + "step": 4970, + "time_per_iteration": 2.677116632461548 + }, + { + "auxiliary_loss_clip": 0.01119911, + "auxiliary_loss_mlp": 0.01084215, + "balance_loss_clip": 1.02552497, + "balance_loss_mlp": 1.00391603, + "epoch": 0.5977274093669211, + "flos": 19098623831040.0, + "grad_norm": 1.7827515817887472, + "language_loss": 0.81347001, + "learning_rate": 1.4704583716941147e-06, + "loss": 0.83551133, + "num_input_tokens_seen": 107165415, + "step": 4971, + "time_per_iteration": 3.677616596221924 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01085134, + "balance_loss_clip": 1.02738249, + "balance_loss_mlp": 1.00473928, + "epoch": 0.5978476522575603, + "flos": 20376002269440.0, + "grad_norm": 1.6845363050452493, + "language_loss": 0.72205693, + "learning_rate": 1.4697072406631672e-06, + "loss": 0.74420071, + "num_input_tokens_seen": 107185320, + "step": 4972, + "time_per_iteration": 3.64750075340271 + }, + { + "auxiliary_loss_clip": 0.01099226, + "auxiliary_loss_mlp": 0.010845, + "balance_loss_clip": 1.0241127, + "balance_loss_mlp": 1.00420094, + "epoch": 0.5979678951481994, + "flos": 29023147728000.0, + "grad_norm": 2.461463895125648, + "language_loss": 0.72921729, + "learning_rate": 1.4689561900772097e-06, + "loss": 0.75105453, + "num_input_tokens_seen": 107205380, + "step": 4973, + "time_per_iteration": 2.8336124420166016 + }, + { + "auxiliary_loss_clip": 0.0111707, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.02468371, + "balance_loss_mlp": 1.00434721, + "epoch": 0.5980881380388384, + "flos": 17967689141760.0, + "grad_norm": 3.0208142982070796, + "language_loss": 0.72196722, + "learning_rate": 1.4682052200501758e-06, + "loss": 0.74398446, + "num_input_tokens_seen": 107222585, + "step": 4974, + "time_per_iteration": 3.636943817138672 + }, + { + "auxiliary_loss_clip": 0.01135775, + "auxiliary_loss_mlp": 0.0108437, + "balance_loss_clip": 1.02571762, + "balance_loss_mlp": 1.00411856, + "epoch": 0.5982083809294776, + "flos": 22962827013120.0, + "grad_norm": 2.3052183554245076, + "language_loss": 0.79872972, + "learning_rate": 1.4674543306959876e-06, + "loss": 0.8209312, + "num_input_tokens_seen": 107242055, + "step": 4975, + "time_per_iteration": 2.660421848297119 + }, + { + "auxiliary_loss_clip": 0.01118005, + "auxiliary_loss_mlp": 0.01084086, + "balance_loss_clip": 1.02459383, + "balance_loss_mlp": 1.00383461, + "epoch": 0.5983286238201166, + "flos": 20991941712000.0, + "grad_norm": 2.1944826966863107, + "language_loss": 0.84162223, + "learning_rate": 1.4667035221285535e-06, + "loss": 0.86364305, + "num_input_tokens_seen": 107259695, + "step": 4976, + "time_per_iteration": 2.690751552581787 + }, + { + "auxiliary_loss_clip": 0.01119785, + "auxiliary_loss_mlp": 0.01084313, + "balance_loss_clip": 1.0242033, + "balance_loss_mlp": 1.00410879, + "epoch": 0.5984488667107557, + "flos": 28183448511360.0, + "grad_norm": 1.6116714819159141, + "language_loss": 0.74135047, + "learning_rate": 1.4659527944617715e-06, + "loss": 0.76339138, + "num_input_tokens_seen": 107279640, + "step": 4977, + "time_per_iteration": 2.7461166381835938 + }, + { + "auxiliary_loss_clip": 0.01087914, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_clip": 1.02182865, + "balance_loss_mlp": 1.00428021, + "epoch": 0.5985691096013949, + "flos": 16471794314880.0, + "grad_norm": 2.433433444284681, + "language_loss": 0.7589345, + "learning_rate": 1.465202147809526e-06, + "loss": 0.78065848, + "num_input_tokens_seen": 107298135, + "step": 4978, + "time_per_iteration": 3.6964266300201416 + }, + { + "auxiliary_loss_clip": 0.01137122, + "auxiliary_loss_mlp": 0.01084566, + "balance_loss_clip": 1.02635765, + "balance_loss_mlp": 1.00421846, + "epoch": 0.5986893524920339, + "flos": 26719046933760.0, + "grad_norm": 1.8541971935839332, + "language_loss": 0.76475954, + "learning_rate": 1.4644515822856888e-06, + "loss": 0.78697646, + "num_input_tokens_seen": 107316570, + "step": 4979, + "time_per_iteration": 2.631671667098999 + }, + { + "auxiliary_loss_clip": 0.01093732, + "auxiliary_loss_mlp": 0.01079365, + "balance_loss_clip": 1.02234435, + "balance_loss_mlp": 1.00040078, + "epoch": 0.598809595382673, + "flos": 61608061100160.0, + "grad_norm": 0.7504423500390587, + "language_loss": 0.56593376, + "learning_rate": 1.4637010980041215e-06, + "loss": 0.58766472, + "num_input_tokens_seen": 107378680, + "step": 4980, + "time_per_iteration": 3.3481738567352295 + }, + { + "auxiliary_loss_clip": 0.01136642, + "auxiliary_loss_mlp": 0.0108524, + "balance_loss_clip": 1.02647734, + "balance_loss_mlp": 1.00489247, + "epoch": 0.5989298382733121, + "flos": 11801719549440.0, + "grad_norm": 2.074784895558278, + "language_loss": 0.89592373, + "learning_rate": 1.4629506950786707e-06, + "loss": 0.9181425, + "num_input_tokens_seen": 107394860, + "step": 4981, + "time_per_iteration": 2.573425054550171 + }, + { + "auxiliary_loss_clip": 0.01117682, + "auxiliary_loss_mlp": 0.01079265, + "balance_loss_clip": 1.02165067, + "balance_loss_mlp": 1.00030065, + "epoch": 0.5990500811639512, + "flos": 60025800021120.0, + "grad_norm": 0.8106129069303204, + "language_loss": 0.56109118, + "learning_rate": 1.4622003736231733e-06, + "loss": 0.58306062, + "num_input_tokens_seen": 107453850, + "step": 4982, + "time_per_iteration": 3.232814311981201 + }, + { + "auxiliary_loss_clip": 0.01122229, + "auxiliary_loss_mlp": 0.01083627, + "balance_loss_clip": 1.02258086, + "balance_loss_mlp": 1.00332808, + "epoch": 0.5991703240545903, + "flos": 18222726683520.0, + "grad_norm": 1.9137330126103647, + "language_loss": 0.80379689, + "learning_rate": 1.461450133751451e-06, + "loss": 0.82585549, + "num_input_tokens_seen": 107471920, + "step": 4983, + "time_per_iteration": 2.646812677383423 + }, + { + "auxiliary_loss_clip": 0.01110891, + "auxiliary_loss_mlp": 0.0108398, + "balance_loss_clip": 1.02543759, + "balance_loss_mlp": 1.00363255, + "epoch": 0.5992905669452293, + "flos": 27709894581120.0, + "grad_norm": 1.8310281278220883, + "language_loss": 0.75917143, + "learning_rate": 1.4606999755773153e-06, + "loss": 0.78112012, + "num_input_tokens_seen": 107493125, + "step": 4984, + "time_per_iteration": 2.7801754474639893 + }, + { + "auxiliary_loss_clip": 0.01136223, + "auxiliary_loss_mlp": 0.01084448, + "balance_loss_clip": 1.02622962, + "balance_loss_mlp": 1.00424409, + "epoch": 0.5994108098358685, + "flos": 20449008662400.0, + "grad_norm": 1.7288276808423957, + "language_loss": 0.82118082, + "learning_rate": 1.4599498992145643e-06, + "loss": 0.84338748, + "num_input_tokens_seen": 107513150, + "step": 4985, + "time_per_iteration": 2.6380138397216797 + }, + { + "auxiliary_loss_clip": 0.01118416, + "auxiliary_loss_mlp": 0.00872867, + "balance_loss_clip": 1.02452862, + "balance_loss_mlp": 1.00001717, + "epoch": 0.5995310527265075, + "flos": 22269966595200.0, + "grad_norm": 2.1432853750648544, + "language_loss": 0.70831037, + "learning_rate": 1.4591999047769846e-06, + "loss": 0.7282232, + "num_input_tokens_seen": 107532005, + "step": 4986, + "time_per_iteration": 2.735569953918457 + }, + { + "auxiliary_loss_clip": 0.01091833, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_clip": 1.0237751, + "balance_loss_mlp": 1.00446725, + "epoch": 0.5996512956171466, + "flos": 18916951818240.0, + "grad_norm": 1.7547120668691485, + "language_loss": 0.7566191, + "learning_rate": 1.4584499923783486e-06, + "loss": 0.77838463, + "num_input_tokens_seen": 107550585, + "step": 4987, + "time_per_iteration": 2.812291383743286 + }, + { + "auxiliary_loss_clip": 0.01116588, + "auxiliary_loss_mlp": 0.010856, + "balance_loss_clip": 1.02448523, + "balance_loss_mlp": 1.005301, + "epoch": 0.5997715385077858, + "flos": 15370916330880.0, + "grad_norm": 2.1589945831832953, + "language_loss": 0.76004875, + "learning_rate": 1.457700162132419e-06, + "loss": 0.78207064, + "num_input_tokens_seen": 107567575, + "step": 4988, + "time_per_iteration": 2.7021193504333496 + }, + { + "auxiliary_loss_clip": 0.01089189, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.02126241, + "balance_loss_mlp": 1.00431681, + "epoch": 0.5998917813984248, + "flos": 25264844818560.0, + "grad_norm": 3.4472704488623336, + "language_loss": 0.72533566, + "learning_rate": 1.4569504141529433e-06, + "loss": 0.74707323, + "num_input_tokens_seen": 107585410, + "step": 4989, + "time_per_iteration": 2.8860929012298584 + }, + { + "auxiliary_loss_clip": 0.01122173, + "auxiliary_loss_mlp": 0.01085712, + "balance_loss_clip": 1.02266407, + "balance_loss_mlp": 1.00531697, + "epoch": 0.6000120242890639, + "flos": 22054502862720.0, + "grad_norm": 1.8848403336230022, + "language_loss": 0.71731883, + "learning_rate": 1.456200748553658e-06, + "loss": 0.73939764, + "num_input_tokens_seen": 107603405, + "step": 4990, + "time_per_iteration": 2.667354106903076 + }, + { + "auxiliary_loss_clip": 0.0113749, + "auxiliary_loss_mlp": 0.01085253, + "balance_loss_clip": 1.02704823, + "balance_loss_mlp": 1.00490594, + "epoch": 0.600132267179703, + "flos": 29863421562240.0, + "grad_norm": 1.4752081967013706, + "language_loss": 0.78621334, + "learning_rate": 1.455451165448287e-06, + "loss": 0.8084408, + "num_input_tokens_seen": 107626060, + "step": 4991, + "time_per_iteration": 2.7665348052978516 + }, + { + "auxiliary_loss_clip": 0.01111274, + "auxiliary_loss_mlp": 0.0108509, + "balance_loss_clip": 1.0247519, + "balance_loss_mlp": 1.00474262, + "epoch": 0.6002525100703421, + "flos": 25045358762880.0, + "grad_norm": 2.2866264197716295, + "language_loss": 0.73859453, + "learning_rate": 1.4547016649505407e-06, + "loss": 0.76055813, + "num_input_tokens_seen": 107644070, + "step": 4992, + "time_per_iteration": 2.727717161178589 + }, + { + "auxiliary_loss_clip": 0.01109165, + "auxiliary_loss_mlp": 0.01085911, + "balance_loss_clip": 1.02381444, + "balance_loss_mlp": 1.00561142, + "epoch": 0.6003727529609811, + "flos": 20849592113280.0, + "grad_norm": 2.463086221169553, + "language_loss": 0.84791768, + "learning_rate": 1.4539522471741193e-06, + "loss": 0.86986852, + "num_input_tokens_seen": 107661495, + "step": 4993, + "time_per_iteration": 308.9917616844177 + }, + { + "auxiliary_loss_clip": 0.01126315, + "auxiliary_loss_mlp": 0.01084764, + "balance_loss_clip": 1.02501154, + "balance_loss_mlp": 1.00422573, + "epoch": 0.6004929958516203, + "flos": 15594604277760.0, + "grad_norm": 2.034910319246162, + "language_loss": 0.70505208, + "learning_rate": 1.4532029122327067e-06, + "loss": 0.72716284, + "num_input_tokens_seen": 107678280, + "step": 4994, + "time_per_iteration": 2.621676445007324 + }, + { + "auxiliary_loss_clip": 0.01101337, + "auxiliary_loss_mlp": 0.01084904, + "balance_loss_clip": 1.02077484, + "balance_loss_mlp": 1.00460458, + "epoch": 0.6006132387422594, + "flos": 21763267390080.0, + "grad_norm": 1.8310251263646795, + "language_loss": 0.75534093, + "learning_rate": 1.4524536602399783e-06, + "loss": 0.77720332, + "num_input_tokens_seen": 107697370, + "step": 4995, + "time_per_iteration": 2.775132656097412 + }, + { + "auxiliary_loss_clip": 0.01115992, + "auxiliary_loss_mlp": 0.01085473, + "balance_loss_clip": 1.02563453, + "balance_loss_mlp": 1.00522172, + "epoch": 0.6007334816328984, + "flos": 22858542852480.0, + "grad_norm": 1.8556970848298349, + "language_loss": 0.77357656, + "learning_rate": 1.4517044913095938e-06, + "loss": 0.79559124, + "num_input_tokens_seen": 107717790, + "step": 4996, + "time_per_iteration": 3.6409974098205566 + }, + { + "auxiliary_loss_clip": 0.01126599, + "auxiliary_loss_mlp": 0.01084219, + "balance_loss_clip": 1.02581668, + "balance_loss_mlp": 1.00391996, + "epoch": 0.6008537245235376, + "flos": 28324577047680.0, + "grad_norm": 1.5415746012038973, + "language_loss": 0.8119204, + "learning_rate": 1.4509554055552022e-06, + "loss": 0.8340286, + "num_input_tokens_seen": 107738020, + "step": 4997, + "time_per_iteration": 2.732436180114746 + }, + { + "auxiliary_loss_clip": 0.01119602, + "auxiliary_loss_mlp": 0.01084755, + "balance_loss_clip": 1.02658212, + "balance_loss_mlp": 1.00440824, + "epoch": 0.6009739674141766, + "flos": 20886113266560.0, + "grad_norm": 2.9234610005119843, + "language_loss": 0.83597171, + "learning_rate": 1.450206403090439e-06, + "loss": 0.8580153, + "num_input_tokens_seen": 107756215, + "step": 4998, + "time_per_iteration": 3.6484978199005127 + }, + { + "auxiliary_loss_clip": 0.01126719, + "auxiliary_loss_mlp": 0.01083547, + "balance_loss_clip": 1.02662206, + "balance_loss_mlp": 1.00334334, + "epoch": 0.6010942103048157, + "flos": 20481004702080.0, + "grad_norm": 2.180005586734595, + "language_loss": 0.86269975, + "learning_rate": 1.4494574840289274e-06, + "loss": 0.88480246, + "num_input_tokens_seen": 107773330, + "step": 4999, + "time_per_iteration": 2.6725943088531494 + }, + { + "auxiliary_loss_clip": 0.01127542, + "auxiliary_loss_mlp": 0.01086019, + "balance_loss_clip": 1.02502942, + "balance_loss_mlp": 1.00557661, + "epoch": 0.6012144531954549, + "flos": 23805973935360.0, + "grad_norm": 2.11805818301597, + "language_loss": 0.74010539, + "learning_rate": 1.4487086484842782e-06, + "loss": 0.76224101, + "num_input_tokens_seen": 107791975, + "step": 5000, + "time_per_iteration": 3.7215495109558105 + }, + { + "auxiliary_loss_clip": 0.01137122, + "auxiliary_loss_mlp": 0.0108522, + "balance_loss_clip": 1.02692485, + "balance_loss_mlp": 1.00487304, + "epoch": 0.6013346960860939, + "flos": 18988378012800.0, + "grad_norm": 2.359056837759892, + "language_loss": 0.60100353, + "learning_rate": 1.4479598965700878e-06, + "loss": 0.623227, + "num_input_tokens_seen": 107809240, + "step": 5001, + "time_per_iteration": 2.609926700592041 + }, + { + "auxiliary_loss_clip": 0.01108535, + "auxiliary_loss_mlp": 0.01084401, + "balance_loss_clip": 1.02455831, + "balance_loss_mlp": 1.00414896, + "epoch": 0.601454938976733, + "flos": 24025316336640.0, + "grad_norm": 2.365694660363541, + "language_loss": 0.69266647, + "learning_rate": 1.4472112283999427e-06, + "loss": 0.71459579, + "num_input_tokens_seen": 107827895, + "step": 5002, + "time_per_iteration": 2.9079575538635254 + }, + { + "auxiliary_loss_clip": 0.01120576, + "auxiliary_loss_mlp": 0.01084446, + "balance_loss_clip": 1.02147603, + "balance_loss_mlp": 1.0041945, + "epoch": 0.6015751818673721, + "flos": 26427129102720.0, + "grad_norm": 2.2261608252120397, + "language_loss": 0.69471246, + "learning_rate": 1.4464626440874143e-06, + "loss": 0.71676266, + "num_input_tokens_seen": 107847010, + "step": 5003, + "time_per_iteration": 3.660754680633545 + }, + { + "auxiliary_loss_clip": 0.01086147, + "auxiliary_loss_mlp": 0.0108378, + "balance_loss_clip": 1.02463651, + "balance_loss_mlp": 1.00343263, + "epoch": 0.6016954247580112, + "flos": 13115260005120.0, + "grad_norm": 2.2654485573653496, + "language_loss": 0.74090952, + "learning_rate": 1.4457141437460636e-06, + "loss": 0.76260877, + "num_input_tokens_seen": 107864235, + "step": 5004, + "time_per_iteration": 2.7206802368164062 + }, + { + "auxiliary_loss_clip": 0.01115605, + "auxiliary_loss_mlp": 0.01084223, + "balance_loss_clip": 1.02361512, + "balance_loss_mlp": 1.00378013, + "epoch": 0.6018156676486502, + "flos": 23768447201280.0, + "grad_norm": 1.633514072231762, + "language_loss": 0.73114729, + "learning_rate": 1.444965727489436e-06, + "loss": 0.75314558, + "num_input_tokens_seen": 107883680, + "step": 5005, + "time_per_iteration": 2.7836220264434814 + }, + { + "auxiliary_loss_clip": 0.01109223, + "auxiliary_loss_mlp": 0.01084869, + "balance_loss_clip": 1.02415562, + "balance_loss_mlp": 1.00466466, + "epoch": 0.6019359105392894, + "flos": 26469360518400.0, + "grad_norm": 1.9053729052979247, + "language_loss": 0.6328212, + "learning_rate": 1.444217395431066e-06, + "loss": 0.65476215, + "num_input_tokens_seen": 107906220, + "step": 5006, + "time_per_iteration": 2.849472999572754 + }, + { + "auxiliary_loss_clip": 0.01090002, + "auxiliary_loss_mlp": 0.01078989, + "balance_loss_clip": 1.01845336, + "balance_loss_mlp": 1.00002468, + "epoch": 0.6020561534299285, + "flos": 69190849728000.0, + "grad_norm": 0.7933098455057553, + "language_loss": 0.55863106, + "learning_rate": 1.4434691476844755e-06, + "loss": 0.58032095, + "num_input_tokens_seen": 107967195, + "step": 5007, + "time_per_iteration": 3.3528690338134766 + }, + { + "auxiliary_loss_clip": 0.0111684, + "auxiliary_loss_mlp": 0.0108426, + "balance_loss_clip": 1.02483344, + "balance_loss_mlp": 1.00405574, + "epoch": 0.6021763963205675, + "flos": 21835304115840.0, + "grad_norm": 2.5968452626533813, + "language_loss": 0.66783535, + "learning_rate": 1.4427209843631729e-06, + "loss": 0.68984634, + "num_input_tokens_seen": 107984245, + "step": 5008, + "time_per_iteration": 2.7254786491394043 + }, + { + "auxiliary_loss_clip": 0.011361, + "auxiliary_loss_mlp": 0.00872914, + "balance_loss_clip": 1.02666366, + "balance_loss_mlp": 1.00003493, + "epoch": 0.6022966392112067, + "flos": 26578636669440.0, + "grad_norm": 1.778126090059364, + "language_loss": 0.81490445, + "learning_rate": 1.4419729055806534e-06, + "loss": 0.83499455, + "num_input_tokens_seen": 108003680, + "step": 5009, + "time_per_iteration": 2.751781463623047 + }, + { + "auxiliary_loss_clip": 0.01117741, + "auxiliary_loss_mlp": 0.00873017, + "balance_loss_clip": 1.02644217, + "balance_loss_mlp": 1.00002563, + "epoch": 0.6024168821018457, + "flos": 20703722981760.0, + "grad_norm": 1.689093355033531, + "language_loss": 0.82346219, + "learning_rate": 1.441224911450401e-06, + "loss": 0.84336978, + "num_input_tokens_seen": 108019635, + "step": 5010, + "time_per_iteration": 2.7192559242248535 + }, + { + "auxiliary_loss_clip": 0.01130294, + "auxiliary_loss_mlp": 0.01083471, + "balance_loss_clip": 1.02796245, + "balance_loss_mlp": 1.00326705, + "epoch": 0.6025371249924848, + "flos": 24680973242880.0, + "grad_norm": 1.7009943732572443, + "language_loss": 0.82055938, + "learning_rate": 1.4404770020858851e-06, + "loss": 0.84269702, + "num_input_tokens_seen": 108039120, + "step": 5011, + "time_per_iteration": 2.7036995887756348 + }, + { + "auxiliary_loss_clip": 0.01118502, + "auxiliary_loss_mlp": 0.01084986, + "balance_loss_clip": 1.02387452, + "balance_loss_mlp": 1.0047344, + "epoch": 0.602657367883124, + "flos": 25955801815680.0, + "grad_norm": 1.5005795699709703, + "language_loss": 0.86022747, + "learning_rate": 1.439729177600563e-06, + "loss": 0.88226235, + "num_input_tokens_seen": 108059615, + "step": 5012, + "time_per_iteration": 2.653050661087036 + }, + { + "auxiliary_loss_clip": 0.01126653, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_clip": 1.02576089, + "balance_loss_mlp": 1.00489378, + "epoch": 0.602777610773763, + "flos": 16690633925760.0, + "grad_norm": 2.1050967989068865, + "language_loss": 0.7275902, + "learning_rate": 1.4389814381078793e-06, + "loss": 0.74970871, + "num_input_tokens_seen": 108078855, + "step": 5013, + "time_per_iteration": 2.7302565574645996 + }, + { + "auxiliary_loss_clip": 0.01061189, + "auxiliary_loss_mlp": 0.0108598, + "balance_loss_clip": 1.02146995, + "balance_loss_mlp": 1.00568032, + "epoch": 0.6028978536644021, + "flos": 13334243270400.0, + "grad_norm": 2.077668989968645, + "language_loss": 0.80057389, + "learning_rate": 1.438233783721265e-06, + "loss": 0.82204556, + "num_input_tokens_seen": 108095020, + "step": 5014, + "time_per_iteration": 3.183410882949829 + }, + { + "auxiliary_loss_clip": 0.01111441, + "auxiliary_loss_mlp": 0.01085496, + "balance_loss_clip": 1.02221072, + "balance_loss_mlp": 1.0052923, + "epoch": 0.6030180965550412, + "flos": 19644825018240.0, + "grad_norm": 1.9008161966339334, + "language_loss": 0.77880156, + "learning_rate": 1.43748621455414e-06, + "loss": 0.80077094, + "num_input_tokens_seen": 108111455, + "step": 5015, + "time_per_iteration": 2.9864957332611084 + }, + { + "auxiliary_loss_clip": 0.01116219, + "auxiliary_loss_mlp": 0.01085063, + "balance_loss_clip": 1.02450597, + "balance_loss_mlp": 1.00471652, + "epoch": 0.6031383394456803, + "flos": 14458390289280.0, + "grad_norm": 2.489464620500146, + "language_loss": 0.81051302, + "learning_rate": 1.4367387307199082e-06, + "loss": 0.83252579, + "num_input_tokens_seen": 108128305, + "step": 5016, + "time_per_iteration": 2.7254507541656494 + }, + { + "auxiliary_loss_clip": 0.01128402, + "auxiliary_loss_mlp": 0.0108457, + "balance_loss_clip": 1.02637327, + "balance_loss_mlp": 1.00431871, + "epoch": 0.6032585823363193, + "flos": 13917791623680.0, + "grad_norm": 1.7594294578059877, + "language_loss": 0.82322901, + "learning_rate": 1.4359913323319632e-06, + "loss": 0.84535873, + "num_input_tokens_seen": 108145475, + "step": 5017, + "time_per_iteration": 2.6468381881713867 + }, + { + "auxiliary_loss_clip": 0.01090889, + "auxiliary_loss_mlp": 0.01085231, + "balance_loss_clip": 1.02335238, + "balance_loss_mlp": 1.00493193, + "epoch": 0.6033788252269584, + "flos": 24353252530560.0, + "grad_norm": 2.192314251655916, + "language_loss": 0.77904665, + "learning_rate": 1.4352440195036847e-06, + "loss": 0.80080783, + "num_input_tokens_seen": 108165650, + "step": 5018, + "time_per_iteration": 2.8602817058563232 + }, + { + "auxiliary_loss_clip": 0.01065156, + "auxiliary_loss_mlp": 0.01084754, + "balance_loss_clip": 1.02220416, + "balance_loss_mlp": 1.0045023, + "epoch": 0.6034990681175976, + "flos": 25521247077120.0, + "grad_norm": 1.4755422538061391, + "language_loss": 0.79729581, + "learning_rate": 1.4344967923484395e-06, + "loss": 0.81879491, + "num_input_tokens_seen": 108187620, + "step": 5019, + "time_per_iteration": 2.8474209308624268 + }, + { + "auxiliary_loss_clip": 0.01126796, + "auxiliary_loss_mlp": 0.0108499, + "balance_loss_clip": 1.02609384, + "balance_loss_mlp": 1.00473845, + "epoch": 0.6036193110082366, + "flos": 25958387594880.0, + "grad_norm": 1.8455318994628185, + "language_loss": 0.72604209, + "learning_rate": 1.433749650979581e-06, + "loss": 0.74816, + "num_input_tokens_seen": 108207605, + "step": 5020, + "time_per_iteration": 2.7265608310699463 + }, + { + "auxiliary_loss_clip": 0.01091656, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_clip": 1.02335715, + "balance_loss_mlp": 1.00413954, + "epoch": 0.6037395538988757, + "flos": 25593427457280.0, + "grad_norm": 1.8105747757372816, + "language_loss": 0.68115163, + "learning_rate": 1.433002595510451e-06, + "loss": 0.70291257, + "num_input_tokens_seen": 108226385, + "step": 5021, + "time_per_iteration": 4.110980272293091 + }, + { + "auxiliary_loss_clip": 0.01118051, + "auxiliary_loss_mlp": 0.00872978, + "balance_loss_clip": 1.02419686, + "balance_loss_mlp": 1.00003374, + "epoch": 0.6038597967895148, + "flos": 17816253402240.0, + "grad_norm": 1.7128548386105698, + "language_loss": 0.71962124, + "learning_rate": 1.4322556260543757e-06, + "loss": 0.73953152, + "num_input_tokens_seen": 108242960, + "step": 5022, + "time_per_iteration": 2.860377550125122 + }, + { + "auxiliary_loss_clip": 0.01092527, + "auxiliary_loss_mlp": 0.01078741, + "balance_loss_clip": 1.02082968, + "balance_loss_mlp": 0.99977702, + "epoch": 0.6039800396801539, + "flos": 65169213235200.0, + "grad_norm": 0.8935076054949462, + "language_loss": 0.62717724, + "learning_rate": 1.4315087427246703e-06, + "loss": 0.6488899, + "num_input_tokens_seen": 108296785, + "step": 5023, + "time_per_iteration": 4.303269386291504 + }, + { + "auxiliary_loss_clip": 0.01117422, + "auxiliary_loss_mlp": 0.0107885, + "balance_loss_clip": 1.02143967, + "balance_loss_mlp": 0.99988574, + "epoch": 0.604100282570793, + "flos": 67386409073280.0, + "grad_norm": 0.8715373693156072, + "language_loss": 0.58524179, + "learning_rate": 1.4307619456346372e-06, + "loss": 0.60720456, + "num_input_tokens_seen": 108341090, + "step": 5024, + "time_per_iteration": 3.0497937202453613 + }, + { + "auxiliary_loss_clip": 0.01127221, + "auxiliary_loss_mlp": 0.01083757, + "balance_loss_clip": 1.02505422, + "balance_loss_mlp": 1.00350475, + "epoch": 0.6042205254614321, + "flos": 35297495631360.0, + "grad_norm": 1.8945326707320986, + "language_loss": 0.74028021, + "learning_rate": 1.430015234897564e-06, + "loss": 0.76238996, + "num_input_tokens_seen": 108364370, + "step": 5025, + "time_per_iteration": 3.7973506450653076 + }, + { + "auxiliary_loss_clip": 0.01136297, + "auxiliary_loss_mlp": 0.00872887, + "balance_loss_clip": 1.02627885, + "balance_loss_mlp": 1.00010002, + "epoch": 0.6043407683520712, + "flos": 45658262206080.0, + "grad_norm": 1.5584597046429993, + "language_loss": 0.66170102, + "learning_rate": 1.4292686106267274e-06, + "loss": 0.68179286, + "num_input_tokens_seen": 108387220, + "step": 5026, + "time_per_iteration": 2.8846938610076904 + }, + { + "auxiliary_loss_clip": 0.01128753, + "auxiliary_loss_mlp": 0.01084141, + "balance_loss_clip": 1.02642155, + "balance_loss_mlp": 1.00384164, + "epoch": 0.6044610112427102, + "flos": 16180020138240.0, + "grad_norm": 1.6445371288056954, + "language_loss": 0.77321863, + "learning_rate": 1.4285220729353876e-06, + "loss": 0.79534757, + "num_input_tokens_seen": 108405760, + "step": 5027, + "time_per_iteration": 2.767275333404541 + }, + { + "auxiliary_loss_clip": 0.01119218, + "auxiliary_loss_mlp": 0.01083634, + "balance_loss_clip": 1.02534485, + "balance_loss_mlp": 1.00342989, + "epoch": 0.6045812541333494, + "flos": 13804062186240.0, + "grad_norm": 1.8189704885979534, + "language_loss": 0.7776134, + "learning_rate": 1.4277756219367957e-06, + "loss": 0.79964197, + "num_input_tokens_seen": 108422785, + "step": 5028, + "time_per_iteration": 2.7219653129577637 + }, + { + "auxiliary_loss_clip": 0.01110153, + "auxiliary_loss_mlp": 0.01084217, + "balance_loss_clip": 1.02558196, + "balance_loss_mlp": 1.00377476, + "epoch": 0.6047014970239885, + "flos": 19975059682560.0, + "grad_norm": 2.2068007520061266, + "language_loss": 0.79693633, + "learning_rate": 1.4270292577441864e-06, + "loss": 0.81888002, + "num_input_tokens_seen": 108442290, + "step": 5029, + "time_per_iteration": 3.788318395614624 + }, + { + "auxiliary_loss_clip": 0.01127389, + "auxiliary_loss_mlp": 0.01083789, + "balance_loss_clip": 1.02520514, + "balance_loss_mlp": 1.00344205, + "epoch": 0.6048217399146275, + "flos": 25337097025920.0, + "grad_norm": 1.6916078323522925, + "language_loss": 0.7180835, + "learning_rate": 1.4262829804707836e-06, + "loss": 0.74019527, + "num_input_tokens_seen": 108464280, + "step": 5030, + "time_per_iteration": 2.708200216293335 + }, + { + "auxiliary_loss_clip": 0.01128534, + "auxiliary_loss_mlp": 0.01084806, + "balance_loss_clip": 1.02628601, + "balance_loss_mlp": 1.00445914, + "epoch": 0.6049419828052667, + "flos": 26030819370240.0, + "grad_norm": 1.5150472185229387, + "language_loss": 0.69857508, + "learning_rate": 1.4255367902297958e-06, + "loss": 0.72070849, + "num_input_tokens_seen": 108485610, + "step": 5031, + "time_per_iteration": 2.7376763820648193 + }, + { + "auxiliary_loss_clip": 0.01136658, + "auxiliary_loss_mlp": 0.01084231, + "balance_loss_clip": 1.02659023, + "balance_loss_mlp": 1.00402689, + "epoch": 0.6050622256959057, + "flos": 14648106948480.0, + "grad_norm": 2.0715057730693007, + "language_loss": 0.79088676, + "learning_rate": 1.4247906871344215e-06, + "loss": 0.81309563, + "num_input_tokens_seen": 108501005, + "step": 5032, + "time_per_iteration": 2.6759283542633057 + }, + { + "auxiliary_loss_clip": 0.01117132, + "auxiliary_loss_mlp": 0.01084778, + "balance_loss_clip": 1.02428317, + "balance_loss_mlp": 1.00452638, + "epoch": 0.6051824685865448, + "flos": 23331450337920.0, + "grad_norm": 2.3918435105326377, + "language_loss": 0.75357074, + "learning_rate": 1.4240446712978415e-06, + "loss": 0.77558982, + "num_input_tokens_seen": 108519990, + "step": 5033, + "time_per_iteration": 2.7448503971099854 + }, + { + "auxiliary_loss_clip": 0.01128286, + "auxiliary_loss_mlp": 0.01085454, + "balance_loss_clip": 1.0265615, + "balance_loss_mlp": 1.00505948, + "epoch": 0.605302711477184, + "flos": 27563307177600.0, + "grad_norm": 2.007737369783937, + "language_loss": 0.74809098, + "learning_rate": 1.423298742833227e-06, + "loss": 0.77022839, + "num_input_tokens_seen": 108538650, + "step": 5034, + "time_per_iteration": 2.7583529949188232 + }, + { + "auxiliary_loss_clip": 0.01109457, + "auxiliary_loss_mlp": 0.01085446, + "balance_loss_clip": 1.02451301, + "balance_loss_mlp": 1.00509858, + "epoch": 0.605422954367823, + "flos": 15154698412800.0, + "grad_norm": 2.075257765429043, + "language_loss": 0.71552193, + "learning_rate": 1.4225529018537352e-06, + "loss": 0.73747098, + "num_input_tokens_seen": 108554155, + "step": 5035, + "time_per_iteration": 2.7942264080047607 + }, + { + "auxiliary_loss_clip": 0.01137438, + "auxiliary_loss_mlp": 0.01085747, + "balance_loss_clip": 1.02729321, + "balance_loss_mlp": 1.00549531, + "epoch": 0.6055431972584621, + "flos": 27673912131840.0, + "grad_norm": 1.6135540110514897, + "language_loss": 0.77730566, + "learning_rate": 1.4218071484725082e-06, + "loss": 0.79953754, + "num_input_tokens_seen": 108576275, + "step": 5036, + "time_per_iteration": 2.7409555912017822 + }, + { + "auxiliary_loss_clip": 0.01110822, + "auxiliary_loss_mlp": 0.01084716, + "balance_loss_clip": 1.02460444, + "balance_loss_mlp": 1.00436902, + "epoch": 0.6056634401491012, + "flos": 19387489006080.0, + "grad_norm": 1.922378256554942, + "language_loss": 0.76028919, + "learning_rate": 1.4210614828026786e-06, + "loss": 0.78224462, + "num_input_tokens_seen": 108594125, + "step": 5037, + "time_per_iteration": 2.7242977619171143 + }, + { + "auxiliary_loss_clip": 0.0113616, + "auxiliary_loss_mlp": 0.01084715, + "balance_loss_clip": 1.02572381, + "balance_loss_mlp": 1.004511, + "epoch": 0.6057836830397403, + "flos": 24789459294720.0, + "grad_norm": 1.5163343686757498, + "language_loss": 0.74248391, + "learning_rate": 1.4203159049573605e-06, + "loss": 0.76469266, + "num_input_tokens_seen": 108615360, + "step": 5038, + "time_per_iteration": 2.800703287124634 + }, + { + "auxiliary_loss_clip": 0.01103531, + "auxiliary_loss_mlp": 0.01083896, + "balance_loss_clip": 1.02598596, + "balance_loss_mlp": 1.00359654, + "epoch": 0.6059039259303793, + "flos": 20558248899840.0, + "grad_norm": 1.964858883053213, + "language_loss": 0.86574483, + "learning_rate": 1.4195704150496593e-06, + "loss": 0.88761914, + "num_input_tokens_seen": 108633075, + "step": 5039, + "time_per_iteration": 2.67085337638855 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01086098, + "balance_loss_clip": 1.02547657, + "balance_loss_mlp": 1.0058465, + "epoch": 0.6060241688210185, + "flos": 21069724613760.0, + "grad_norm": 1.6658690612440645, + "language_loss": 0.73557174, + "learning_rate": 1.4188250131926639e-06, + "loss": 0.75755674, + "num_input_tokens_seen": 108651875, + "step": 5040, + "time_per_iteration": 2.7458419799804688 + }, + { + "auxiliary_loss_clip": 0.01119514, + "auxiliary_loss_mlp": 0.01085363, + "balance_loss_clip": 1.02630448, + "balance_loss_mlp": 1.00501621, + "epoch": 0.6061444117116576, + "flos": 16361081619840.0, + "grad_norm": 1.812282150979574, + "language_loss": 0.80555248, + "learning_rate": 1.4180796994994525e-06, + "loss": 0.82760125, + "num_input_tokens_seen": 108669290, + "step": 5041, + "time_per_iteration": 2.6699845790863037 + }, + { + "auxiliary_loss_clip": 0.01117251, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_clip": 1.02481377, + "balance_loss_mlp": 1.00409341, + "epoch": 0.6062646546022966, + "flos": 21507296094720.0, + "grad_norm": 1.676073892903213, + "language_loss": 0.71874237, + "learning_rate": 1.4173344740830877e-06, + "loss": 0.7407583, + "num_input_tokens_seen": 108688420, + "step": 5042, + "time_per_iteration": 2.7600433826446533 + }, + { + "auxiliary_loss_clip": 0.01113718, + "auxiliary_loss_mlp": 0.01085873, + "balance_loss_clip": 1.02275383, + "balance_loss_mlp": 1.00552559, + "epoch": 0.6063848974929358, + "flos": 38983151283840.0, + "grad_norm": 1.5057201126938204, + "language_loss": 0.70367718, + "learning_rate": 1.4165893370566206e-06, + "loss": 0.72567308, + "num_input_tokens_seen": 108712175, + "step": 5043, + "time_per_iteration": 2.8870034217834473 + }, + { + "auxiliary_loss_clip": 0.01129362, + "auxiliary_loss_mlp": 0.01084177, + "balance_loss_clip": 1.02669394, + "balance_loss_mlp": 1.00383019, + "epoch": 0.6065051403835748, + "flos": 19646584784640.0, + "grad_norm": 1.6823075510163927, + "language_loss": 0.77573407, + "learning_rate": 1.4158442885330865e-06, + "loss": 0.79786944, + "num_input_tokens_seen": 108730745, + "step": 5044, + "time_per_iteration": 2.7294976711273193 + }, + { + "auxiliary_loss_clip": 0.01128869, + "auxiliary_loss_mlp": 0.01084938, + "balance_loss_clip": 1.02657986, + "balance_loss_mlp": 1.00468588, + "epoch": 0.6066253832742139, + "flos": 23513086437120.0, + "grad_norm": 1.8893190968721012, + "language_loss": 0.783934, + "learning_rate": 1.4150993286255094e-06, + "loss": 0.806072, + "num_input_tokens_seen": 108749995, + "step": 5045, + "time_per_iteration": 2.700911283493042 + }, + { + "auxiliary_loss_clip": 0.01135528, + "auxiliary_loss_mlp": 0.01084342, + "balance_loss_clip": 1.02546299, + "balance_loss_mlp": 1.00399518, + "epoch": 0.6067456261648531, + "flos": 19133708440320.0, + "grad_norm": 1.9468365098448308, + "language_loss": 0.79712927, + "learning_rate": 1.4143544574468993e-06, + "loss": 0.81932795, + "num_input_tokens_seen": 108768355, + "step": 5046, + "time_per_iteration": 2.66506290435791 + }, + { + "auxiliary_loss_clip": 0.01124974, + "auxiliary_loss_mlp": 0.01083781, + "balance_loss_clip": 1.02456021, + "balance_loss_mlp": 1.00357676, + "epoch": 0.6068658690554921, + "flos": 20520614424960.0, + "grad_norm": 1.6478005661999033, + "language_loss": 0.82169884, + "learning_rate": 1.4136096751102523e-06, + "loss": 0.84378636, + "num_input_tokens_seen": 108786685, + "step": 5047, + "time_per_iteration": 3.537993907928467 + }, + { + "auxiliary_loss_clip": 0.01118218, + "auxiliary_loss_mlp": 0.01084482, + "balance_loss_clip": 1.02546847, + "balance_loss_mlp": 1.00418305, + "epoch": 0.6069861119461312, + "flos": 27374560185600.0, + "grad_norm": 1.9723725584880782, + "language_loss": 0.8302635, + "learning_rate": 1.4128649817285516e-06, + "loss": 0.85229051, + "num_input_tokens_seen": 108804820, + "step": 5048, + "time_per_iteration": 2.7320239543914795 + }, + { + "auxiliary_loss_clip": 0.01120139, + "auxiliary_loss_mlp": 0.01085801, + "balance_loss_clip": 1.02642393, + "balance_loss_mlp": 1.00554919, + "epoch": 0.6071063548367702, + "flos": 25626500904960.0, + "grad_norm": 1.667548563936335, + "language_loss": 0.63174063, + "learning_rate": 1.412120377414766e-06, + "loss": 0.65380001, + "num_input_tokens_seen": 108825010, + "step": 5049, + "time_per_iteration": 3.579826831817627 + }, + { + "auxiliary_loss_clip": 0.0113705, + "auxiliary_loss_mlp": 0.01084554, + "balance_loss_clip": 1.02710485, + "balance_loss_mlp": 1.0043025, + "epoch": 0.6072265977274094, + "flos": 24460517520000.0, + "grad_norm": 1.5440777789904423, + "language_loss": 0.71157897, + "learning_rate": 1.4113758622818522e-06, + "loss": 0.73379499, + "num_input_tokens_seen": 108845075, + "step": 5050, + "time_per_iteration": 3.6716666221618652 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.00872657, + "balance_loss_clip": 1.0265075, + "balance_loss_mlp": 1.00013423, + "epoch": 0.6073468406180484, + "flos": 18149253413760.0, + "grad_norm": 2.6694035079159875, + "language_loss": 0.83224189, + "learning_rate": 1.410631436442751e-06, + "loss": 0.85216987, + "num_input_tokens_seen": 108863870, + "step": 5051, + "time_per_iteration": 2.6537373065948486 + }, + { + "auxiliary_loss_clip": 0.01128488, + "auxiliary_loss_mlp": 0.01084989, + "balance_loss_clip": 1.02596891, + "balance_loss_mlp": 1.00464213, + "epoch": 0.6074670835086875, + "flos": 20697617669760.0, + "grad_norm": 2.2855250026649117, + "language_loss": 0.86631793, + "learning_rate": 1.4098871000103936e-06, + "loss": 0.88845265, + "num_input_tokens_seen": 108882470, + "step": 5052, + "time_per_iteration": 2.680901288986206 + }, + { + "auxiliary_loss_clip": 0.01103507, + "auxiliary_loss_mlp": 0.01084459, + "balance_loss_clip": 1.02605963, + "balance_loss_mlp": 1.00425518, + "epoch": 0.6075873263993267, + "flos": 23769955572480.0, + "grad_norm": 1.5497987542800606, + "language_loss": 0.82488763, + "learning_rate": 1.409142853097693e-06, + "loss": 0.84676731, + "num_input_tokens_seen": 108902710, + "step": 5053, + "time_per_iteration": 2.664097547531128 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.0108399, + "balance_loss_clip": 1.0239557, + "balance_loss_mlp": 1.00369072, + "epoch": 0.6077075692899657, + "flos": 24454484035200.0, + "grad_norm": 2.002531842059, + "language_loss": 0.79531604, + "learning_rate": 1.408398695817553e-06, + "loss": 0.81731772, + "num_input_tokens_seen": 108919935, + "step": 5054, + "time_per_iteration": 3.638596534729004 + }, + { + "auxiliary_loss_clip": 0.0111931, + "auxiliary_loss_mlp": 0.01084419, + "balance_loss_clip": 1.02556658, + "balance_loss_mlp": 1.00402427, + "epoch": 0.6078278121806048, + "flos": 27382102041600.0, + "grad_norm": 1.7323716739713955, + "language_loss": 0.69992626, + "learning_rate": 1.4076546282828593e-06, + "loss": 0.72196352, + "num_input_tokens_seen": 108942790, + "step": 5055, + "time_per_iteration": 2.759923219680786 + }, + { + "auxiliary_loss_clip": 0.011198, + "auxiliary_loss_mlp": 0.01085092, + "balance_loss_clip": 1.02491331, + "balance_loss_mlp": 1.00479281, + "epoch": 0.6079480550712439, + "flos": 38436447306240.0, + "grad_norm": 2.1574578670998643, + "language_loss": 0.6641587, + "learning_rate": 1.4069106506064874e-06, + "loss": 0.68620765, + "num_input_tokens_seen": 108964215, + "step": 5056, + "time_per_iteration": 2.837116003036499 + }, + { + "auxiliary_loss_clip": 0.01116199, + "auxiliary_loss_mlp": 0.01084739, + "balance_loss_clip": 1.02575767, + "balance_loss_mlp": 1.0045352, + "epoch": 0.608068297961883, + "flos": 25336271013120.0, + "grad_norm": 1.5947057902282658, + "language_loss": 0.78406453, + "learning_rate": 1.4061667629012989e-06, + "loss": 0.8060739, + "num_input_tokens_seen": 108984885, + "step": 5057, + "time_per_iteration": 2.741374969482422 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.01083707, + "balance_loss_clip": 1.02734363, + "balance_loss_mlp": 1.00345516, + "epoch": 0.608188540852522, + "flos": 24202463235840.0, + "grad_norm": 1.5416626200515167, + "language_loss": 0.83263117, + "learning_rate": 1.40542296528014e-06, + "loss": 0.85466158, + "num_input_tokens_seen": 109004545, + "step": 5058, + "time_per_iteration": 2.6828174591064453 + }, + { + "auxiliary_loss_clip": 0.01130608, + "auxiliary_loss_mlp": 0.01086027, + "balance_loss_clip": 1.02775109, + "balance_loss_mlp": 1.00567985, + "epoch": 0.6083087837431612, + "flos": 21284146851840.0, + "grad_norm": 2.2200154330082627, + "language_loss": 0.76043481, + "learning_rate": 1.4046792578558452e-06, + "loss": 0.78260112, + "num_input_tokens_seen": 109022440, + "step": 5059, + "time_per_iteration": 2.6411631107330322 + }, + { + "auxiliary_loss_clip": 0.01120631, + "auxiliary_loss_mlp": 0.01083495, + "balance_loss_clip": 1.02671885, + "balance_loss_mlp": 1.00333881, + "epoch": 0.6084290266338003, + "flos": 16471435178880.0, + "grad_norm": 2.309798968884677, + "language_loss": 0.76018959, + "learning_rate": 1.4039356407412325e-06, + "loss": 0.78223085, + "num_input_tokens_seen": 109035680, + "step": 5060, + "time_per_iteration": 2.6365253925323486 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01078941, + "balance_loss_clip": 1.02127624, + "balance_loss_mlp": 0.99997675, + "epoch": 0.6085492695244393, + "flos": 66443574931200.0, + "grad_norm": 0.7818908056498413, + "language_loss": 0.57198572, + "learning_rate": 1.40319211404911e-06, + "loss": 0.59386718, + "num_input_tokens_seen": 109090680, + "step": 5061, + "time_per_iteration": 3.19246506690979 + }, + { + "auxiliary_loss_clip": 0.01136208, + "auxiliary_loss_mlp": 0.01083813, + "balance_loss_clip": 1.02609789, + "balance_loss_mlp": 1.00365639, + "epoch": 0.6086695124150785, + "flos": 23618986709760.0, + "grad_norm": 1.9831336476046157, + "language_loss": 0.90337288, + "learning_rate": 1.4024486778922691e-06, + "loss": 0.92557311, + "num_input_tokens_seen": 109108995, + "step": 5062, + "time_per_iteration": 2.638404607772827 + }, + { + "auxiliary_loss_clip": 0.0111783, + "auxiliary_loss_mlp": 0.01084512, + "balance_loss_clip": 1.02372217, + "balance_loss_mlp": 1.00421309, + "epoch": 0.6087897553057176, + "flos": 20157054917760.0, + "grad_norm": 2.107796467817298, + "language_loss": 0.7757107, + "learning_rate": 1.4017053323834884e-06, + "loss": 0.79773414, + "num_input_tokens_seen": 109128825, + "step": 5063, + "time_per_iteration": 2.6636874675750732 + }, + { + "auxiliary_loss_clip": 0.01118853, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_clip": 1.02515876, + "balance_loss_mlp": 1.00470102, + "epoch": 0.6089099981963566, + "flos": 25482535194240.0, + "grad_norm": 2.091247280239151, + "language_loss": 0.75964308, + "learning_rate": 1.4009620776355333e-06, + "loss": 0.78168112, + "num_input_tokens_seen": 109150425, + "step": 5064, + "time_per_iteration": 2.743475914001465 + }, + { + "auxiliary_loss_clip": 0.01128829, + "auxiliary_loss_mlp": 0.01085751, + "balance_loss_clip": 1.02583504, + "balance_loss_mlp": 1.00549936, + "epoch": 0.6090302410869958, + "flos": 25332895134720.0, + "grad_norm": 2.671141617965317, + "language_loss": 0.79003859, + "learning_rate": 1.4002189137611553e-06, + "loss": 0.81218433, + "num_input_tokens_seen": 109169765, + "step": 5065, + "time_per_iteration": 2.6468803882598877 + }, + { + "auxiliary_loss_clip": 0.01128173, + "auxiliary_loss_mlp": 0.01084005, + "balance_loss_clip": 1.02601779, + "balance_loss_mlp": 1.00380099, + "epoch": 0.6091504839776348, + "flos": 23987358639360.0, + "grad_norm": 1.5675866606354694, + "language_loss": 0.69393814, + "learning_rate": 1.3994758408730901e-06, + "loss": 0.71605992, + "num_input_tokens_seen": 109188950, + "step": 5066, + "time_per_iteration": 2.697761058807373 + }, + { + "auxiliary_loss_clip": 0.01117091, + "auxiliary_loss_mlp": 0.01083925, + "balance_loss_clip": 1.0252918, + "balance_loss_mlp": 1.0036732, + "epoch": 0.6092707268682739, + "flos": 29643037666560.0, + "grad_norm": 1.8770955937454057, + "language_loss": 0.76207751, + "learning_rate": 1.3987328590840629e-06, + "loss": 0.78408766, + "num_input_tokens_seen": 109209895, + "step": 5067, + "time_per_iteration": 2.7335102558135986 + }, + { + "auxiliary_loss_clip": 0.01128605, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_clip": 1.02580345, + "balance_loss_mlp": 1.0047363, + "epoch": 0.609390969758913, + "flos": 24024957200640.0, + "grad_norm": 1.9832505710854655, + "language_loss": 0.86194479, + "learning_rate": 1.397989968506783e-06, + "loss": 0.88408172, + "num_input_tokens_seen": 109228905, + "step": 5068, + "time_per_iteration": 2.7256674766540527 + }, + { + "auxiliary_loss_clip": 0.01138173, + "auxiliary_loss_mlp": 0.01085622, + "balance_loss_clip": 1.02715325, + "balance_loss_mlp": 1.00522709, + "epoch": 0.6095112126495521, + "flos": 11102143288320.0, + "grad_norm": 2.013190307515689, + "language_loss": 0.71861631, + "learning_rate": 1.3972471692539458e-06, + "loss": 0.74085426, + "num_input_tokens_seen": 109243620, + "step": 5069, + "time_per_iteration": 2.5826468467712402 + }, + { + "auxiliary_loss_clip": 0.01119053, + "auxiliary_loss_mlp": 0.01085636, + "balance_loss_clip": 1.02603984, + "balance_loss_mlp": 1.00538421, + "epoch": 0.6096314555401912, + "flos": 17265491187840.0, + "grad_norm": 1.7987743918752916, + "language_loss": 0.7508508, + "learning_rate": 1.3965044614382348e-06, + "loss": 0.77289772, + "num_input_tokens_seen": 109259070, + "step": 5070, + "time_per_iteration": 2.6558666229248047 + }, + { + "auxiliary_loss_clip": 0.01137298, + "auxiliary_loss_mlp": 0.01085052, + "balance_loss_clip": 1.02733064, + "balance_loss_mlp": 1.00465739, + "epoch": 0.6097516984308303, + "flos": 21645910679040.0, + "grad_norm": 4.139783842699765, + "language_loss": 0.75289375, + "learning_rate": 1.3957618451723162e-06, + "loss": 0.77511728, + "num_input_tokens_seen": 109275100, + "step": 5071, + "time_per_iteration": 2.5446696281433105 + }, + { + "auxiliary_loss_clip": 0.01118713, + "auxiliary_loss_mlp": 0.01085402, + "balance_loss_clip": 1.02577758, + "balance_loss_mlp": 1.00500727, + "epoch": 0.6098719413214694, + "flos": 27199208966400.0, + "grad_norm": 1.7356665605009063, + "language_loss": 0.71595663, + "learning_rate": 1.3950193205688457e-06, + "loss": 0.73799777, + "num_input_tokens_seen": 109294825, + "step": 5072, + "time_per_iteration": 3.5623645782470703 + }, + { + "auxiliary_loss_clip": 0.0110993, + "auxiliary_loss_mlp": 0.01084062, + "balance_loss_clip": 1.02322221, + "balance_loss_mlp": 1.00385749, + "epoch": 0.6099921842121084, + "flos": 20412954385920.0, + "grad_norm": 2.1334673090469036, + "language_loss": 0.83821207, + "learning_rate": 1.3942768877404627e-06, + "loss": 0.86015201, + "num_input_tokens_seen": 109313790, + "step": 5073, + "time_per_iteration": 2.688527822494507 + }, + { + "auxiliary_loss_clip": 0.01136496, + "auxiliary_loss_mlp": 0.01083481, + "balance_loss_clip": 1.02599311, + "balance_loss_mlp": 1.0031817, + "epoch": 0.6101124271027476, + "flos": 23366139897600.0, + "grad_norm": 1.4683354731605556, + "language_loss": 0.73676926, + "learning_rate": 1.393534546799795e-06, + "loss": 0.75896907, + "num_input_tokens_seen": 109333490, + "step": 5074, + "time_per_iteration": 2.5746588706970215 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.01084623, + "balance_loss_clip": 1.02649283, + "balance_loss_mlp": 1.0042758, + "epoch": 0.6102326699933867, + "flos": 26687840993280.0, + "grad_norm": 1.697930681622902, + "language_loss": 0.67670572, + "learning_rate": 1.3927922978594536e-06, + "loss": 0.69874525, + "num_input_tokens_seen": 109354575, + "step": 5075, + "time_per_iteration": 3.5272257328033447 + }, + { + "auxiliary_loss_clip": 0.01108048, + "auxiliary_loss_mlp": 0.01078847, + "balance_loss_clip": 1.02042329, + "balance_loss_mlp": 0.99988288, + "epoch": 0.6103529128840257, + "flos": 60644612551680.0, + "grad_norm": 0.7936232943692189, + "language_loss": 0.57439059, + "learning_rate": 1.3920501410320387e-06, + "loss": 0.59625959, + "num_input_tokens_seen": 109410690, + "step": 5076, + "time_per_iteration": 4.118930101394653 + }, + { + "auxiliary_loss_clip": 0.0111816, + "auxiliary_loss_mlp": 0.01084451, + "balance_loss_clip": 1.02479112, + "balance_loss_mlp": 1.00410366, + "epoch": 0.6104731557746649, + "flos": 19021307806080.0, + "grad_norm": 2.5160644353041826, + "language_loss": 0.76409352, + "learning_rate": 1.3913080764301333e-06, + "loss": 0.78611958, + "num_input_tokens_seen": 109427650, + "step": 5077, + "time_per_iteration": 2.6983754634857178 + }, + { + "auxiliary_loss_clip": 0.01102462, + "auxiliary_loss_mlp": 0.01084076, + "balance_loss_clip": 1.0244894, + "balance_loss_mlp": 1.003824, + "epoch": 0.6105933986653039, + "flos": 23366894083200.0, + "grad_norm": 1.7210208900073019, + "language_loss": 0.71542287, + "learning_rate": 1.3905661041663085e-06, + "loss": 0.73728824, + "num_input_tokens_seen": 109448835, + "step": 5078, + "time_per_iteration": 2.877687931060791 + }, + { + "auxiliary_loss_clip": 0.01126666, + "auxiliary_loss_mlp": 0.01084892, + "balance_loss_clip": 1.02503824, + "balance_loss_mlp": 1.0044024, + "epoch": 0.610713641555943, + "flos": 34637565006720.0, + "grad_norm": 2.1583575579488308, + "language_loss": 0.64940751, + "learning_rate": 1.389824224353122e-06, + "loss": 0.67152309, + "num_input_tokens_seen": 109470425, + "step": 5079, + "time_per_iteration": 3.7112185955047607 + }, + { + "auxiliary_loss_clip": 0.01126997, + "auxiliary_loss_mlp": 0.01084851, + "balance_loss_clip": 1.02603126, + "balance_loss_mlp": 1.00459957, + "epoch": 0.610833884446582, + "flos": 26646471504000.0, + "grad_norm": 1.5598203916135813, + "language_loss": 0.76933748, + "learning_rate": 1.389082437103115e-06, + "loss": 0.79145598, + "num_input_tokens_seen": 109489695, + "step": 5080, + "time_per_iteration": 2.681694507598877 + }, + { + "auxiliary_loss_clip": 0.0110449, + "auxiliary_loss_mlp": 0.01084775, + "balance_loss_clip": 1.0245682, + "balance_loss_mlp": 1.00438046, + "epoch": 0.6109541273372212, + "flos": 21215126868480.0, + "grad_norm": 1.6844311866903436, + "language_loss": 0.77762294, + "learning_rate": 1.3883407425288172e-06, + "loss": 0.79951555, + "num_input_tokens_seen": 109510030, + "step": 5081, + "time_per_iteration": 2.719768762588501 + }, + { + "auxiliary_loss_clip": 0.01119413, + "auxiliary_loss_mlp": 0.01084416, + "balance_loss_clip": 1.02562129, + "balance_loss_mlp": 1.00416422, + "epoch": 0.6110743702278603, + "flos": 20084084438400.0, + "grad_norm": 2.116083214951366, + "language_loss": 0.79921758, + "learning_rate": 1.3875991407427417e-06, + "loss": 0.82125592, + "num_input_tokens_seen": 109528255, + "step": 5082, + "time_per_iteration": 2.6582655906677246 + }, + { + "auxiliary_loss_clip": 0.01093417, + "auxiliary_loss_mlp": 0.01079018, + "balance_loss_clip": 1.02112746, + "balance_loss_mlp": 1.00005341, + "epoch": 0.6111946131184993, + "flos": 68302957438080.0, + "grad_norm": 0.7667579232027498, + "language_loss": 0.58188963, + "learning_rate": 1.38685763185739e-06, + "loss": 0.60361397, + "num_input_tokens_seen": 109581915, + "step": 5083, + "time_per_iteration": 3.3263626098632812 + }, + { + "auxiliary_loss_clip": 0.01135808, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_clip": 1.02594781, + "balance_loss_mlp": 1.00615561, + "epoch": 0.6113148560091385, + "flos": 19937676602880.0, + "grad_norm": 2.5531623839477535, + "language_loss": 0.67779541, + "learning_rate": 1.3861162159852476e-06, + "loss": 0.70001704, + "num_input_tokens_seen": 109600050, + "step": 5084, + "time_per_iteration": 2.541638135910034 + }, + { + "auxiliary_loss_clip": 0.01118882, + "auxiliary_loss_mlp": 0.01086531, + "balance_loss_clip": 1.02562344, + "balance_loss_mlp": 1.0060885, + "epoch": 0.6114350988997775, + "flos": 23731854220800.0, + "grad_norm": 1.7709587139693932, + "language_loss": 0.79706299, + "learning_rate": 1.3853748932387875e-06, + "loss": 0.81911719, + "num_input_tokens_seen": 109620690, + "step": 5085, + "time_per_iteration": 2.683499813079834 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01086628, + "balance_loss_clip": 1.02368617, + "balance_loss_mlp": 1.00613749, + "epoch": 0.6115553417904166, + "flos": 24023700224640.0, + "grad_norm": 3.1699059700081116, + "language_loss": 0.75026208, + "learning_rate": 1.3846336637304671e-06, + "loss": 0.77223963, + "num_input_tokens_seen": 109638960, + "step": 5086, + "time_per_iteration": 2.676231861114502 + }, + { + "auxiliary_loss_clip": 0.01113983, + "auxiliary_loss_mlp": 0.01085937, + "balance_loss_clip": 1.02229357, + "balance_loss_mlp": 1.00568557, + "epoch": 0.6116755846810558, + "flos": 23733542160000.0, + "grad_norm": 1.7809550137401264, + "language_loss": 0.8290965, + "learning_rate": 1.3838925275727316e-06, + "loss": 0.85109568, + "num_input_tokens_seen": 109659700, + "step": 5087, + "time_per_iteration": 2.7029197216033936 + }, + { + "auxiliary_loss_clip": 0.01137603, + "auxiliary_loss_mlp": 0.01084576, + "balance_loss_clip": 1.02742767, + "balance_loss_mlp": 1.00437164, + "epoch": 0.6117958275716948, + "flos": 18661626967680.0, + "grad_norm": 1.7317832539489053, + "language_loss": 0.7909556, + "learning_rate": 1.3831514848780089e-06, + "loss": 0.81317741, + "num_input_tokens_seen": 109679275, + "step": 5088, + "time_per_iteration": 2.5439815521240234 + }, + { + "auxiliary_loss_clip": 0.01129318, + "auxiliary_loss_mlp": 0.01084983, + "balance_loss_clip": 1.02709365, + "balance_loss_mlp": 1.0047313, + "epoch": 0.6119160704623339, + "flos": 16471183783680.0, + "grad_norm": 2.609044985424887, + "language_loss": 0.92380875, + "learning_rate": 1.3824105357587152e-06, + "loss": 0.9459517, + "num_input_tokens_seen": 109696380, + "step": 5089, + "time_per_iteration": 2.6357357501983643 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.0108373, + "balance_loss_clip": 1.02506864, + "balance_loss_mlp": 1.00343037, + "epoch": 0.612036313352973, + "flos": 23915465568000.0, + "grad_norm": 1.4531098426867537, + "language_loss": 0.8258813, + "learning_rate": 1.381669680327253e-06, + "loss": 0.84790099, + "num_input_tokens_seen": 109718060, + "step": 5090, + "time_per_iteration": 2.723227024078369 + }, + { + "auxiliary_loss_clip": 0.0112016, + "auxiliary_loss_mlp": 0.01084597, + "balance_loss_clip": 1.02705288, + "balance_loss_mlp": 1.00429773, + "epoch": 0.6121565562436121, + "flos": 26974766833920.0, + "grad_norm": 1.8850107004764811, + "language_loss": 0.71287322, + "learning_rate": 1.380928918696008e-06, + "loss": 0.73492086, + "num_input_tokens_seen": 109736830, + "step": 5091, + "time_per_iteration": 2.693011999130249 + }, + { + "auxiliary_loss_clip": 0.0112672, + "auxiliary_loss_mlp": 0.01084596, + "balance_loss_clip": 1.02536345, + "balance_loss_mlp": 1.00420129, + "epoch": 0.6122767991342511, + "flos": 15668867646720.0, + "grad_norm": 3.0349535631434774, + "language_loss": 0.71620053, + "learning_rate": 1.3801882509773548e-06, + "loss": 0.73831367, + "num_input_tokens_seen": 109754690, + "step": 5092, + "time_per_iteration": 2.681518077850342 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01084208, + "balance_loss_clip": 1.02560735, + "balance_loss_mlp": 1.00395656, + "epoch": 0.6123970420248903, + "flos": 27964321591680.0, + "grad_norm": 1.8425844400471765, + "language_loss": 0.81631577, + "learning_rate": 1.3794476772836503e-06, + "loss": 0.83843887, + "num_input_tokens_seen": 109775790, + "step": 5093, + "time_per_iteration": 2.654291868209839 + }, + { + "auxiliary_loss_clip": 0.01103254, + "auxiliary_loss_mlp": 0.01083868, + "balance_loss_clip": 1.02119255, + "balance_loss_mlp": 1.00342536, + "epoch": 0.6125172849155294, + "flos": 21468727866240.0, + "grad_norm": 1.539107672693431, + "language_loss": 0.84331858, + "learning_rate": 1.3787071977272402e-06, + "loss": 0.86518979, + "num_input_tokens_seen": 109795050, + "step": 5094, + "time_per_iteration": 2.7342865467071533 + }, + { + "auxiliary_loss_clip": 0.01089175, + "auxiliary_loss_mlp": 0.01084449, + "balance_loss_clip": 1.02130961, + "balance_loss_mlp": 1.00410223, + "epoch": 0.6126375278061684, + "flos": 16248321849600.0, + "grad_norm": 2.5214528178843523, + "language_loss": 0.72467709, + "learning_rate": 1.3779668124204535e-06, + "loss": 0.74641335, + "num_input_tokens_seen": 109811465, + "step": 5095, + "time_per_iteration": 2.7274460792541504 + }, + { + "auxiliary_loss_clip": 0.0111745, + "auxiliary_loss_mlp": 0.01084424, + "balance_loss_clip": 1.0255903, + "balance_loss_mlp": 1.00426745, + "epoch": 0.6127577706968076, + "flos": 20448865008000.0, + "grad_norm": 1.5735582848489296, + "language_loss": 0.80763733, + "learning_rate": 1.3772265214756074e-06, + "loss": 0.82965612, + "num_input_tokens_seen": 109831225, + "step": 5096, + "time_per_iteration": 2.7128546237945557 + }, + { + "auxiliary_loss_clip": 0.01128076, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_clip": 1.02529216, + "balance_loss_mlp": 1.00432551, + "epoch": 0.6128780135874466, + "flos": 18260397072000.0, + "grad_norm": 1.6541555267456851, + "language_loss": 0.75379133, + "learning_rate": 1.3764863250050025e-06, + "loss": 0.77591783, + "num_input_tokens_seen": 109849465, + "step": 5097, + "time_per_iteration": 2.5830225944519043 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01084733, + "balance_loss_clip": 1.02508926, + "balance_loss_mlp": 1.00452876, + "epoch": 0.6129982564780857, + "flos": 24937088192640.0, + "grad_norm": 1.9277677246334877, + "language_loss": 0.80461037, + "learning_rate": 1.3757462231209272e-06, + "loss": 0.82655954, + "num_input_tokens_seen": 109869770, + "step": 5098, + "time_per_iteration": 3.6431775093078613 + }, + { + "auxiliary_loss_clip": 0.01118329, + "auxiliary_loss_mlp": 0.01085065, + "balance_loss_clip": 1.02575362, + "balance_loss_mlp": 1.00471795, + "epoch": 0.6131184993687249, + "flos": 22492038430080.0, + "grad_norm": 1.9439565432193366, + "language_loss": 0.88854074, + "learning_rate": 1.3750062159356525e-06, + "loss": 0.91057467, + "num_input_tokens_seen": 109889120, + "step": 5099, + "time_per_iteration": 2.7389330863952637 + }, + { + "auxiliary_loss_clip": 0.01106683, + "auxiliary_loss_mlp": 0.01084782, + "balance_loss_clip": 1.02384782, + "balance_loss_mlp": 1.00452995, + "epoch": 0.6132387422593639, + "flos": 15885839750400.0, + "grad_norm": 2.4338874292004986, + "language_loss": 0.83180112, + "learning_rate": 1.3742663035614382e-06, + "loss": 0.85371578, + "num_input_tokens_seen": 109906490, + "step": 5100, + "time_per_iteration": 3.6251232624053955 + }, + { + "auxiliary_loss_clip": 0.01135954, + "auxiliary_loss_mlp": 0.01084622, + "balance_loss_clip": 1.02586234, + "balance_loss_mlp": 1.00437033, + "epoch": 0.613358985150003, + "flos": 25411539962880.0, + "grad_norm": 1.731766640255707, + "language_loss": 0.79646862, + "learning_rate": 1.3735264861105283e-06, + "loss": 0.81867439, + "num_input_tokens_seen": 109927130, + "step": 5101, + "time_per_iteration": 3.6458635330200195 + }, + { + "auxiliary_loss_clip": 0.01108444, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_clip": 1.02482224, + "balance_loss_mlp": 1.00437665, + "epoch": 0.6134792280406421, + "flos": 21361283308800.0, + "grad_norm": 1.845049514110292, + "language_loss": 0.78467613, + "learning_rate": 1.372786763695152e-06, + "loss": 0.80660594, + "num_input_tokens_seen": 109945890, + "step": 5102, + "time_per_iteration": 2.7289364337921143 + }, + { + "auxiliary_loss_clip": 0.01128322, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_clip": 1.02583766, + "balance_loss_mlp": 1.00368154, + "epoch": 0.6135994709312812, + "flos": 21211248199680.0, + "grad_norm": 2.1350226972739232, + "language_loss": 0.77459198, + "learning_rate": 1.3720471364275257e-06, + "loss": 0.79671502, + "num_input_tokens_seen": 109965535, + "step": 5103, + "time_per_iteration": 2.630059242248535 + }, + { + "auxiliary_loss_clip": 0.01108242, + "auxiliary_loss_mlp": 0.0087297, + "balance_loss_clip": 1.02314425, + "balance_loss_mlp": 1.00005496, + "epoch": 0.6137197138219203, + "flos": 14794047907200.0, + "grad_norm": 1.7860653522384127, + "language_loss": 0.77454692, + "learning_rate": 1.3713076044198486e-06, + "loss": 0.79435909, + "num_input_tokens_seen": 109982345, + "step": 5104, + "time_per_iteration": 3.57777738571167 + }, + { + "auxiliary_loss_clip": 0.0111934, + "auxiliary_loss_mlp": 0.01084837, + "balance_loss_clip": 1.02618361, + "balance_loss_mlp": 1.0044899, + "epoch": 0.6138399567125594, + "flos": 20084515401600.0, + "grad_norm": 2.025703228919513, + "language_loss": 0.80570251, + "learning_rate": 1.3705681677843086e-06, + "loss": 0.82774425, + "num_input_tokens_seen": 110000940, + "step": 5105, + "time_per_iteration": 2.698070526123047 + }, + { + "auxiliary_loss_clip": 0.01117556, + "auxiliary_loss_mlp": 0.01078695, + "balance_loss_clip": 1.02157366, + "balance_loss_mlp": 0.99973029, + "epoch": 0.6139601996031985, + "flos": 60123838193280.0, + "grad_norm": 0.7712038447536611, + "language_loss": 0.60649168, + "learning_rate": 1.3698288266330768e-06, + "loss": 0.62845421, + "num_input_tokens_seen": 110061565, + "step": 5106, + "time_per_iteration": 3.2765109539031982 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.0108491, + "balance_loss_clip": 1.02482665, + "balance_loss_mlp": 1.00465822, + "epoch": 0.6140804424938375, + "flos": 23586703361280.0, + "grad_norm": 2.215704133500448, + "language_loss": 0.72798753, + "learning_rate": 1.3690895810783113e-06, + "loss": 0.74994552, + "num_input_tokens_seen": 110080360, + "step": 5107, + "time_per_iteration": 2.6728694438934326 + }, + { + "auxiliary_loss_clip": 0.01075826, + "auxiliary_loss_mlp": 0.00872945, + "balance_loss_clip": 1.02344108, + "balance_loss_mlp": 1.00003314, + "epoch": 0.6142006853844767, + "flos": 21398199511680.0, + "grad_norm": 1.8983847008070471, + "language_loss": 0.71273601, + "learning_rate": 1.3683504312321543e-06, + "loss": 0.73222369, + "num_input_tokens_seen": 110100695, + "step": 5108, + "time_per_iteration": 2.8024775981903076 + }, + { + "auxiliary_loss_clip": 0.01128552, + "auxiliary_loss_mlp": 0.01084312, + "balance_loss_clip": 1.0263195, + "balance_loss_mlp": 1.00401258, + "epoch": 0.6143209282751158, + "flos": 12057367622400.0, + "grad_norm": 2.1652787054705795, + "language_loss": 0.80229336, + "learning_rate": 1.3676113772067355e-06, + "loss": 0.824422, + "num_input_tokens_seen": 110117750, + "step": 5109, + "time_per_iteration": 2.6626546382904053 + }, + { + "auxiliary_loss_clip": 0.01098499, + "auxiliary_loss_mlp": 0.01084863, + "balance_loss_clip": 1.02291167, + "balance_loss_mlp": 1.00456321, + "epoch": 0.6144411711657548, + "flos": 25082274965760.0, + "grad_norm": 1.938125489610334, + "language_loss": 0.72713339, + "learning_rate": 1.3668724191141671e-06, + "loss": 0.74896699, + "num_input_tokens_seen": 110137020, + "step": 5110, + "time_per_iteration": 2.751178503036499 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_clip": 1.02203906, + "balance_loss_mlp": 1.00465631, + "epoch": 0.6145614140563939, + "flos": 20114069316480.0, + "grad_norm": 2.3718579911210824, + "language_loss": 0.66231406, + "learning_rate": 1.3661335570665493e-06, + "loss": 0.68419296, + "num_input_tokens_seen": 110154930, + "step": 5111, + "time_per_iteration": 2.7948055267333984 + }, + { + "auxiliary_loss_clip": 0.01119131, + "auxiliary_loss_mlp": 0.01084025, + "balance_loss_clip": 1.02633238, + "balance_loss_mlp": 1.00377345, + "epoch": 0.614681656947033, + "flos": 16800376953600.0, + "grad_norm": 2.214022307930453, + "language_loss": 0.69475156, + "learning_rate": 1.3653947911759676e-06, + "loss": 0.71678317, + "num_input_tokens_seen": 110172480, + "step": 5112, + "time_per_iteration": 2.6272056102752686 + }, + { + "auxiliary_loss_clip": 0.01100363, + "auxiliary_loss_mlp": 0.01087625, + "balance_loss_clip": 1.02433848, + "balance_loss_mlp": 1.00723052, + "epoch": 0.6148018998376721, + "flos": 38801587011840.0, + "grad_norm": 1.7033480770424312, + "language_loss": 0.74366581, + "learning_rate": 1.3646561215544904e-06, + "loss": 0.76554573, + "num_input_tokens_seen": 110197120, + "step": 5113, + "time_per_iteration": 2.922881603240967 + }, + { + "auxiliary_loss_clip": 0.01125784, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_clip": 1.02530146, + "balance_loss_mlp": 1.00490856, + "epoch": 0.6149221427283111, + "flos": 23327032965120.0, + "grad_norm": 2.0085954161465396, + "language_loss": 0.79547548, + "learning_rate": 1.363917548314176e-06, + "loss": 0.81758636, + "num_input_tokens_seen": 110216385, + "step": 5114, + "time_per_iteration": 2.6385550498962402 + }, + { + "auxiliary_loss_clip": 0.01112659, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_clip": 1.02642632, + "balance_loss_mlp": 1.00350165, + "epoch": 0.6150423856189503, + "flos": 22379494141440.0, + "grad_norm": 1.7109885312412054, + "language_loss": 0.72853631, + "learning_rate": 1.3631790715670626e-06, + "loss": 0.75050145, + "num_input_tokens_seen": 110234790, + "step": 5115, + "time_per_iteration": 2.626904010772705 + }, + { + "auxiliary_loss_clip": 0.01077241, + "auxiliary_loss_mlp": 0.01084448, + "balance_loss_clip": 1.0213449, + "balance_loss_mlp": 1.00424373, + "epoch": 0.6151626285095894, + "flos": 18692078722560.0, + "grad_norm": 1.790227787917327, + "language_loss": 0.85584444, + "learning_rate": 1.3624406914251783e-06, + "loss": 0.87746131, + "num_input_tokens_seen": 110251910, + "step": 5116, + "time_per_iteration": 2.8692786693573 + }, + { + "auxiliary_loss_clip": 0.01128671, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_clip": 1.02630043, + "balance_loss_mlp": 1.00350964, + "epoch": 0.6152828714002284, + "flos": 15851688894720.0, + "grad_norm": 1.8246082094142053, + "language_loss": 0.88473815, + "learning_rate": 1.3617024080005335e-06, + "loss": 0.90686202, + "num_input_tokens_seen": 110268810, + "step": 5117, + "time_per_iteration": 2.653141975402832 + }, + { + "auxiliary_loss_clip": 0.0110104, + "auxiliary_loss_mlp": 0.00872951, + "balance_loss_clip": 1.02454865, + "balance_loss_mlp": 1.00008583, + "epoch": 0.6154031142908676, + "flos": 24869792062080.0, + "grad_norm": 1.51032271022384, + "language_loss": 0.74139875, + "learning_rate": 1.3609642214051266e-06, + "loss": 0.76113862, + "num_input_tokens_seen": 110293035, + "step": 5118, + "time_per_iteration": 2.7633144855499268 + }, + { + "auxiliary_loss_clip": 0.01114952, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.02357638, + "balance_loss_mlp": 1.00485086, + "epoch": 0.6155233571815066, + "flos": 19244744357760.0, + "grad_norm": 1.7821666786818675, + "language_loss": 0.65810966, + "learning_rate": 1.3602261317509385e-06, + "loss": 0.68011063, + "num_input_tokens_seen": 110309695, + "step": 5119, + "time_per_iteration": 2.632852554321289 + }, + { + "auxiliary_loss_clip": 0.01127012, + "auxiliary_loss_mlp": 0.01084653, + "balance_loss_clip": 1.02588296, + "balance_loss_mlp": 1.0043056, + "epoch": 0.6156436000721457, + "flos": 18770077105920.0, + "grad_norm": 2.404615429844067, + "language_loss": 0.82332546, + "learning_rate": 1.3594881391499387e-06, + "loss": 0.84544212, + "num_input_tokens_seen": 110328610, + "step": 5120, + "time_per_iteration": 2.6697964668273926 + }, + { + "auxiliary_loss_clip": 0.01119598, + "auxiliary_loss_mlp": 0.01084845, + "balance_loss_clip": 1.02655113, + "balance_loss_mlp": 1.00459373, + "epoch": 0.6157638429627849, + "flos": 18041198325120.0, + "grad_norm": 1.63184034034001, + "language_loss": 0.79229808, + "learning_rate": 1.3587502437140778e-06, + "loss": 0.8143425, + "num_input_tokens_seen": 110346775, + "step": 5121, + "time_per_iteration": 2.696286201477051 + }, + { + "auxiliary_loss_clip": 0.01117692, + "auxiliary_loss_mlp": 0.0108582, + "balance_loss_clip": 1.02457833, + "balance_loss_mlp": 1.00547278, + "epoch": 0.6158840858534239, + "flos": 25556726736000.0, + "grad_norm": 2.2476113009911893, + "language_loss": 0.84868383, + "learning_rate": 1.3580124455552952e-06, + "loss": 0.87071902, + "num_input_tokens_seen": 110366140, + "step": 5122, + "time_per_iteration": 2.6800122261047363 + }, + { + "auxiliary_loss_clip": 0.01128419, + "auxiliary_loss_mlp": 0.00872945, + "balance_loss_clip": 1.02696848, + "balance_loss_mlp": 1.00010455, + "epoch": 0.616004328744063, + "flos": 24640788902400.0, + "grad_norm": 1.8832928806304339, + "language_loss": 0.87476981, + "learning_rate": 1.3572747447855148e-06, + "loss": 0.8947835, + "num_input_tokens_seen": 110386550, + "step": 5123, + "time_per_iteration": 3.543339252471924 + }, + { + "auxiliary_loss_clip": 0.0113829, + "auxiliary_loss_mlp": 0.01083757, + "balance_loss_clip": 1.02808976, + "balance_loss_mlp": 1.00340986, + "epoch": 0.6161245716347021, + "flos": 21689686379520.0, + "grad_norm": 1.921888545744872, + "language_loss": 0.69283104, + "learning_rate": 1.356537141516644e-06, + "loss": 0.71505153, + "num_input_tokens_seen": 110403970, + "step": 5124, + "time_per_iteration": 2.594895839691162 + }, + { + "auxiliary_loss_clip": 0.01126762, + "auxiliary_loss_mlp": 0.01085584, + "balance_loss_clip": 1.02580714, + "balance_loss_mlp": 1.00533259, + "epoch": 0.6162448145253412, + "flos": 35189225061120.0, + "grad_norm": 1.7409078329891328, + "language_loss": 0.61547399, + "learning_rate": 1.3557996358605775e-06, + "loss": 0.63759744, + "num_input_tokens_seen": 110423890, + "step": 5125, + "time_per_iteration": 2.6881980895996094 + }, + { + "auxiliary_loss_clip": 0.01127087, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.02514517, + "balance_loss_mlp": 1.00436509, + "epoch": 0.6163650574159802, + "flos": 21615279356160.0, + "grad_norm": 1.9622490272662558, + "language_loss": 0.69816113, + "learning_rate": 1.3550622279291941e-06, + "loss": 0.72027767, + "num_input_tokens_seen": 110442035, + "step": 5126, + "time_per_iteration": 3.5619091987609863 + }, + { + "auxiliary_loss_clip": 0.01091942, + "auxiliary_loss_mlp": 0.01084747, + "balance_loss_clip": 1.02267742, + "balance_loss_mlp": 1.0044477, + "epoch": 0.6164853003066194, + "flos": 24572163968640.0, + "grad_norm": 1.3334094286844933, + "language_loss": 0.83099103, + "learning_rate": 1.354324917834358e-06, + "loss": 0.85275787, + "num_input_tokens_seen": 110463280, + "step": 5127, + "time_per_iteration": 3.742682456970215 + }, + { + "auxiliary_loss_clip": 0.0108352, + "auxiliary_loss_mlp": 0.00872934, + "balance_loss_clip": 1.01916575, + "balance_loss_mlp": 1.00014067, + "epoch": 0.6166055431972585, + "flos": 21835986474240.0, + "grad_norm": 1.7377012119940627, + "language_loss": 0.77127445, + "learning_rate": 1.353587705687918e-06, + "loss": 0.79083902, + "num_input_tokens_seen": 110481455, + "step": 5128, + "time_per_iteration": 2.7730729579925537 + }, + { + "auxiliary_loss_clip": 0.0112052, + "auxiliary_loss_mlp": 0.01085169, + "balance_loss_clip": 1.02668548, + "balance_loss_mlp": 1.00477409, + "epoch": 0.6167257860878975, + "flos": 17785262943360.0, + "grad_norm": 2.9632591556145997, + "language_loss": 0.72119784, + "learning_rate": 1.3528505916017096e-06, + "loss": 0.74325466, + "num_input_tokens_seen": 110499155, + "step": 5129, + "time_per_iteration": 3.5876317024230957 + }, + { + "auxiliary_loss_clip": 0.01128898, + "auxiliary_loss_mlp": 0.01084622, + "balance_loss_clip": 1.02598488, + "balance_loss_mlp": 1.0042274, + "epoch": 0.6168460289785367, + "flos": 23214811898880.0, + "grad_norm": 1.9802358332916592, + "language_loss": 0.88367271, + "learning_rate": 1.3521135756875514e-06, + "loss": 0.90580785, + "num_input_tokens_seen": 110515470, + "step": 5130, + "time_per_iteration": 2.6814844608306885 + }, + { + "auxiliary_loss_clip": 0.01082937, + "auxiliary_loss_mlp": 0.01083843, + "balance_loss_clip": 1.02241969, + "balance_loss_mlp": 1.00373423, + "epoch": 0.6169662718691757, + "flos": 26213281482240.0, + "grad_norm": 1.4488247742550138, + "language_loss": 0.86222529, + "learning_rate": 1.3513766580572496e-06, + "loss": 0.88389307, + "num_input_tokens_seen": 110538290, + "step": 5131, + "time_per_iteration": 2.812267780303955 + }, + { + "auxiliary_loss_clip": 0.01126365, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_clip": 1.02514243, + "balance_loss_mlp": 1.0039649, + "epoch": 0.6170865147598148, + "flos": 19026120228480.0, + "grad_norm": 1.8937192545162724, + "language_loss": 0.77212238, + "learning_rate": 1.3506398388225924e-06, + "loss": 0.79422724, + "num_input_tokens_seen": 110555610, + "step": 5132, + "time_per_iteration": 2.6661603450775146 + }, + { + "auxiliary_loss_clip": 0.01137095, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_clip": 1.02709913, + "balance_loss_mlp": 1.00412631, + "epoch": 0.617206757650454, + "flos": 18260361158400.0, + "grad_norm": 1.7293266142768757, + "language_loss": 0.71864843, + "learning_rate": 1.349903118095355e-06, + "loss": 0.74086213, + "num_input_tokens_seen": 110574745, + "step": 5133, + "time_per_iteration": 2.613255262374878 + }, + { + "auxiliary_loss_clip": 0.01129714, + "auxiliary_loss_mlp": 0.01084426, + "balance_loss_clip": 1.02677023, + "balance_loss_mlp": 1.00412655, + "epoch": 0.617327000541093, + "flos": 18186959715840.0, + "grad_norm": 1.7718504966732413, + "language_loss": 0.73359358, + "learning_rate": 1.349166495987298e-06, + "loss": 0.75573492, + "num_input_tokens_seen": 110593310, + "step": 5134, + "time_per_iteration": 2.618461847305298 + }, + { + "auxiliary_loss_clip": 0.01093635, + "auxiliary_loss_mlp": 0.01079588, + "balance_loss_clip": 1.01428974, + "balance_loss_mlp": 1.00062394, + "epoch": 0.6174472434317321, + "flos": 61833796122240.0, + "grad_norm": 0.8185715369424416, + "language_loss": 0.61070418, + "learning_rate": 1.348429972610166e-06, + "loss": 0.63243639, + "num_input_tokens_seen": 110657615, + "step": 5135, + "time_per_iteration": 3.338198184967041 + }, + { + "auxiliary_loss_clip": 0.01077224, + "auxiliary_loss_mlp": 0.01079551, + "balance_loss_clip": 1.01499844, + "balance_loss_mlp": 1.00058639, + "epoch": 0.6175674863223712, + "flos": 71230970494080.0, + "grad_norm": 0.8456135242792839, + "language_loss": 0.57914424, + "learning_rate": 1.3476935480756897e-06, + "loss": 0.600712, + "num_input_tokens_seen": 110714365, + "step": 5136, + "time_per_iteration": 3.1798791885375977 + }, + { + "auxiliary_loss_clip": 0.01110935, + "auxiliary_loss_mlp": 0.01084904, + "balance_loss_clip": 1.0248667, + "balance_loss_mlp": 1.00450945, + "epoch": 0.6176877292130103, + "flos": 21835447770240.0, + "grad_norm": 3.847130325643129, + "language_loss": 0.75340366, + "learning_rate": 1.346957222495583e-06, + "loss": 0.77536201, + "num_input_tokens_seen": 110732160, + "step": 5137, + "time_per_iteration": 2.708141803741455 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.00872896, + "balance_loss_clip": 1.02842712, + "balance_loss_mlp": 1.00009763, + "epoch": 0.6178079721036493, + "flos": 17741738638080.0, + "grad_norm": 2.3167059357118216, + "language_loss": 0.71053922, + "learning_rate": 1.3462209959815466e-06, + "loss": 0.73032129, + "num_input_tokens_seen": 110746900, + "step": 5138, + "time_per_iteration": 2.6189403533935547 + }, + { + "auxiliary_loss_clip": 0.0112011, + "auxiliary_loss_mlp": 0.01086553, + "balance_loss_clip": 1.02698505, + "balance_loss_mlp": 1.00639713, + "epoch": 0.6179282149942885, + "flos": 22633131052800.0, + "grad_norm": 1.7838723995893788, + "language_loss": 0.73992085, + "learning_rate": 1.345484868645265e-06, + "loss": 0.76198745, + "num_input_tokens_seen": 110765710, + "step": 5139, + "time_per_iteration": 2.6935229301452637 + }, + { + "auxiliary_loss_clip": 0.01093885, + "auxiliary_loss_mlp": 0.01084962, + "balance_loss_clip": 1.02523947, + "balance_loss_mlp": 1.00456774, + "epoch": 0.6180484578849276, + "flos": 22310330503680.0, + "grad_norm": 3.1224101652761997, + "language_loss": 0.78454888, + "learning_rate": 1.3447488405984088e-06, + "loss": 0.80633736, + "num_input_tokens_seen": 110783970, + "step": 5140, + "time_per_iteration": 2.6896157264709473 + }, + { + "auxiliary_loss_clip": 0.0111655, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_clip": 1.02521181, + "balance_loss_mlp": 1.00488925, + "epoch": 0.6181687007755666, + "flos": 35225458905600.0, + "grad_norm": 2.184896602255757, + "language_loss": 0.70183587, + "learning_rate": 1.3440129119526322e-06, + "loss": 0.72385424, + "num_input_tokens_seen": 110806395, + "step": 5141, + "time_per_iteration": 2.7994959354400635 + }, + { + "auxiliary_loss_clip": 0.01117172, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_clip": 1.02112603, + "balance_loss_mlp": 0.99992102, + "epoch": 0.6182889436662057, + "flos": 61547370094080.0, + "grad_norm": 0.8036389705621539, + "language_loss": 0.51204014, + "learning_rate": 1.3432770828195762e-06, + "loss": 0.53400075, + "num_input_tokens_seen": 110867380, + "step": 5142, + "time_per_iteration": 3.3413443565368652 + }, + { + "auxiliary_loss_clip": 0.01104054, + "auxiliary_loss_mlp": 0.01084181, + "balance_loss_clip": 1.02421856, + "balance_loss_mlp": 1.00383377, + "epoch": 0.6184091865568448, + "flos": 19609991804160.0, + "grad_norm": 2.088641968036028, + "language_loss": 0.70177937, + "learning_rate": 1.3425413533108635e-06, + "loss": 0.72366178, + "num_input_tokens_seen": 110885980, + "step": 5143, + "time_per_iteration": 2.7057101726531982 + }, + { + "auxiliary_loss_clip": 0.01097907, + "auxiliary_loss_mlp": 0.01086467, + "balance_loss_clip": 1.0241313, + "balance_loss_mlp": 1.00611973, + "epoch": 0.6185294294474839, + "flos": 23586882929280.0, + "grad_norm": 3.6566343322895087, + "language_loss": 0.70470005, + "learning_rate": 1.341805723538105e-06, + "loss": 0.72654378, + "num_input_tokens_seen": 110906085, + "step": 5144, + "time_per_iteration": 2.807004451751709 + }, + { + "auxiliary_loss_clip": 0.01103532, + "auxiliary_loss_mlp": 0.01083834, + "balance_loss_clip": 1.026016, + "balance_loss_mlp": 1.00363064, + "epoch": 0.618649672338123, + "flos": 26762032535040.0, + "grad_norm": 1.7885261148775715, + "language_loss": 0.77392447, + "learning_rate": 1.3410701936128948e-06, + "loss": 0.79579812, + "num_input_tokens_seen": 110928865, + "step": 5145, + "time_per_iteration": 2.7617969512939453 + }, + { + "auxiliary_loss_clip": 0.01128454, + "auxiliary_loss_mlp": 0.01083369, + "balance_loss_clip": 1.02729678, + "balance_loss_mlp": 1.00321245, + "epoch": 0.6187699152287621, + "flos": 14456630522880.0, + "grad_norm": 2.304811383230995, + "language_loss": 0.84418857, + "learning_rate": 1.340334763646812e-06, + "loss": 0.86630678, + "num_input_tokens_seen": 110943000, + "step": 5146, + "time_per_iteration": 2.624359369277954 + }, + { + "auxiliary_loss_clip": 0.01137778, + "auxiliary_loss_mlp": 0.0108512, + "balance_loss_clip": 1.02737045, + "balance_loss_mlp": 1.00482082, + "epoch": 0.6188901581194012, + "flos": 20084766796800.0, + "grad_norm": 1.580008399256573, + "language_loss": 0.74524403, + "learning_rate": 1.3395994337514218e-06, + "loss": 0.76747304, + "num_input_tokens_seen": 110963170, + "step": 5147, + "time_per_iteration": 2.5862069129943848 + }, + { + "auxiliary_loss_clip": 0.01127645, + "auxiliary_loss_mlp": 0.01084025, + "balance_loss_clip": 1.02584088, + "balance_loss_mlp": 1.00372589, + "epoch": 0.6190104010100402, + "flos": 25700728360320.0, + "grad_norm": 1.5059524159405304, + "language_loss": 0.78774154, + "learning_rate": 1.3388642040382725e-06, + "loss": 0.80985832, + "num_input_tokens_seen": 110983595, + "step": 5148, + "time_per_iteration": 3.595148801803589 + }, + { + "auxiliary_loss_clip": 0.01111953, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_clip": 1.0258652, + "balance_loss_mlp": 1.00587642, + "epoch": 0.6191306439006794, + "flos": 30442372974720.0, + "grad_norm": 1.595867296075748, + "language_loss": 0.84214902, + "learning_rate": 1.3381290746188975e-06, + "loss": 0.86413032, + "num_input_tokens_seen": 111002965, + "step": 5149, + "time_per_iteration": 2.766981363296509 + }, + { + "auxiliary_loss_clip": 0.01127006, + "auxiliary_loss_mlp": 0.01085275, + "balance_loss_clip": 1.02633524, + "balance_loss_mlp": 1.00492799, + "epoch": 0.6192508867913185, + "flos": 26685793918080.0, + "grad_norm": 2.671048984447587, + "language_loss": 0.6733095, + "learning_rate": 1.3373940456048152e-06, + "loss": 0.69543236, + "num_input_tokens_seen": 111022990, + "step": 5150, + "time_per_iteration": 2.7109317779541016 + }, + { + "auxiliary_loss_clip": 0.01136065, + "auxiliary_loss_mlp": 0.01084403, + "balance_loss_clip": 1.02573752, + "balance_loss_mlp": 1.00415123, + "epoch": 0.6193711296819575, + "flos": 36722036090880.0, + "grad_norm": 1.6117526155142432, + "language_loss": 0.58790845, + "learning_rate": 1.3366591171075299e-06, + "loss": 0.61011314, + "num_input_tokens_seen": 111046495, + "step": 5151, + "time_per_iteration": 3.6633105278015137 + }, + { + "auxiliary_loss_clip": 0.0111601, + "auxiliary_loss_mlp": 0.01083947, + "balance_loss_clip": 1.02492535, + "balance_loss_mlp": 1.00369501, + "epoch": 0.6194913725725967, + "flos": 25192556697600.0, + "grad_norm": 1.8068926566033285, + "language_loss": 0.90997815, + "learning_rate": 1.335924289238529e-06, + "loss": 0.93197775, + "num_input_tokens_seen": 111065705, + "step": 5152, + "time_per_iteration": 2.7208914756774902 + }, + { + "auxiliary_loss_clip": 0.01123196, + "auxiliary_loss_mlp": 0.00873008, + "balance_loss_clip": 1.02367342, + "balance_loss_mlp": 1.00010824, + "epoch": 0.6196116154632357, + "flos": 21178821196800.0, + "grad_norm": 1.7016603243669042, + "language_loss": 0.77018207, + "learning_rate": 1.3351895621092859e-06, + "loss": 0.79014409, + "num_input_tokens_seen": 111086050, + "step": 5153, + "time_per_iteration": 3.592820882797241 + }, + { + "auxiliary_loss_clip": 0.01070166, + "auxiliary_loss_mlp": 0.01084103, + "balance_loss_clip": 1.01918149, + "balance_loss_mlp": 1.00380385, + "epoch": 0.6197318583538748, + "flos": 16253744803200.0, + "grad_norm": 1.9522129141998743, + "language_loss": 0.764943, + "learning_rate": 1.3344549358312567e-06, + "loss": 0.78648561, + "num_input_tokens_seen": 111104450, + "step": 5154, + "time_per_iteration": 2.9276020526885986 + }, + { + "auxiliary_loss_clip": 0.01129072, + "auxiliary_loss_mlp": 0.01084388, + "balance_loss_clip": 1.02695775, + "balance_loss_mlp": 1.00408876, + "epoch": 0.619852101244514, + "flos": 24425612478720.0, + "grad_norm": 1.8102599501319978, + "language_loss": 0.78182888, + "learning_rate": 1.3337204105158852e-06, + "loss": 0.80396342, + "num_input_tokens_seen": 111123320, + "step": 5155, + "time_per_iteration": 3.615259885787964 + }, + { + "auxiliary_loss_clip": 0.01109644, + "auxiliary_loss_mlp": 0.01083773, + "balance_loss_clip": 1.02319384, + "balance_loss_mlp": 1.00342607, + "epoch": 0.619972344135153, + "flos": 16727298733440.0, + "grad_norm": 1.886450943474437, + "language_loss": 0.72568619, + "learning_rate": 1.332985986274597e-06, + "loss": 0.74762034, + "num_input_tokens_seen": 111140950, + "step": 5156, + "time_per_iteration": 2.7223973274230957 + }, + { + "auxiliary_loss_clip": 0.01083638, + "auxiliary_loss_mlp": 0.00872788, + "balance_loss_clip": 1.02337277, + "balance_loss_mlp": 1.0001508, + "epoch": 0.6200925870257921, + "flos": 12495190498560.0, + "grad_norm": 2.0607774036295696, + "language_loss": 0.75240099, + "learning_rate": 1.3322516632188047e-06, + "loss": 0.77196527, + "num_input_tokens_seen": 111157845, + "step": 5157, + "time_per_iteration": 2.815563201904297 + }, + { + "auxiliary_loss_clip": 0.0110826, + "auxiliary_loss_mlp": 0.01086507, + "balance_loss_clip": 1.02493489, + "balance_loss_mlp": 1.00611186, + "epoch": 0.6202128299164312, + "flos": 26539350168960.0, + "grad_norm": 1.861130862712954, + "language_loss": 0.66957295, + "learning_rate": 1.3315174414599045e-06, + "loss": 0.69152069, + "num_input_tokens_seen": 111179165, + "step": 5158, + "time_per_iteration": 2.875572681427002 + }, + { + "auxiliary_loss_clip": 0.01128134, + "auxiliary_loss_mlp": 0.01084646, + "balance_loss_clip": 1.0258441, + "balance_loss_mlp": 1.00429952, + "epoch": 0.6203330728070703, + "flos": 18770508069120.0, + "grad_norm": 1.65410042097441, + "language_loss": 0.75082207, + "learning_rate": 1.3307833211092768e-06, + "loss": 0.77294981, + "num_input_tokens_seen": 111197830, + "step": 5159, + "time_per_iteration": 2.744954824447632 + }, + { + "auxiliary_loss_clip": 0.01138862, + "auxiliary_loss_mlp": 0.01084955, + "balance_loss_clip": 1.02865839, + "balance_loss_mlp": 1.0047034, + "epoch": 0.6204533156977093, + "flos": 20629782835200.0, + "grad_norm": 1.5334425727901104, + "language_loss": 0.75268686, + "learning_rate": 1.3300493022782873e-06, + "loss": 0.77492499, + "num_input_tokens_seen": 111218400, + "step": 5160, + "time_per_iteration": 2.6896846294403076 + }, + { + "auxiliary_loss_clip": 0.01096461, + "auxiliary_loss_mlp": 0.00873011, + "balance_loss_clip": 1.02224731, + "balance_loss_mlp": 1.00007153, + "epoch": 0.6205735585883485, + "flos": 17348050598400.0, + "grad_norm": 1.7584955009382182, + "language_loss": 0.72452533, + "learning_rate": 1.3293153850782855e-06, + "loss": 0.74422002, + "num_input_tokens_seen": 111236720, + "step": 5161, + "time_per_iteration": 2.774564743041992 + }, + { + "auxiliary_loss_clip": 0.01106702, + "auxiliary_loss_mlp": 0.01084479, + "balance_loss_clip": 1.02259851, + "balance_loss_mlp": 1.00408435, + "epoch": 0.6206938014789876, + "flos": 22965017742720.0, + "grad_norm": 1.7618378388440827, + "language_loss": 0.71270114, + "learning_rate": 1.3285815696206069e-06, + "loss": 0.73461294, + "num_input_tokens_seen": 111258265, + "step": 5162, + "time_per_iteration": 2.7433929443359375 + }, + { + "auxiliary_loss_clip": 0.01107457, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_clip": 1.02302217, + "balance_loss_mlp": 1.00406098, + "epoch": 0.6208140443696266, + "flos": 23983192661760.0, + "grad_norm": 2.06981631802234, + "language_loss": 0.76418227, + "learning_rate": 1.32784785601657e-06, + "loss": 0.7861014, + "num_input_tokens_seen": 111277675, + "step": 5163, + "time_per_iteration": 2.8354556560516357 + }, + { + "auxiliary_loss_clip": 0.01119148, + "auxiliary_loss_mlp": 0.01084149, + "balance_loss_clip": 1.02523375, + "balance_loss_mlp": 1.00389695, + "epoch": 0.6209342872602658, + "flos": 35077291303680.0, + "grad_norm": 1.5850321208317764, + "language_loss": 0.73809254, + "learning_rate": 1.3271142443774798e-06, + "loss": 0.76012552, + "num_input_tokens_seen": 111299910, + "step": 5164, + "time_per_iteration": 2.795701503753662 + }, + { + "auxiliary_loss_clip": 0.01115834, + "auxiliary_loss_mlp": 0.01084288, + "balance_loss_clip": 1.02449095, + "balance_loss_mlp": 1.00413132, + "epoch": 0.6210545301509048, + "flos": 26979327861120.0, + "grad_norm": 1.7911342353030932, + "language_loss": 0.8123492, + "learning_rate": 1.3263807348146228e-06, + "loss": 0.83435047, + "num_input_tokens_seen": 111319765, + "step": 5165, + "time_per_iteration": 2.785764455795288 + }, + { + "auxiliary_loss_clip": 0.01119657, + "auxiliary_loss_mlp": 0.01085213, + "balance_loss_clip": 1.02545226, + "balance_loss_mlp": 1.00481844, + "epoch": 0.6211747730415439, + "flos": 33618240852480.0, + "grad_norm": 1.8365446762921813, + "language_loss": 0.73369157, + "learning_rate": 1.3256473274392733e-06, + "loss": 0.75574028, + "num_input_tokens_seen": 111341110, + "step": 5166, + "time_per_iteration": 2.7742526531219482 + }, + { + "auxiliary_loss_clip": 0.01137389, + "auxiliary_loss_mlp": 0.01084747, + "balance_loss_clip": 1.02721918, + "balance_loss_mlp": 1.00435233, + "epoch": 0.6212950159321831, + "flos": 34167099646080.0, + "grad_norm": 1.725577593377591, + "language_loss": 0.7011435, + "learning_rate": 1.3249140223626873e-06, + "loss": 0.72336483, + "num_input_tokens_seen": 111362730, + "step": 5167, + "time_per_iteration": 2.730062246322632 + }, + { + "auxiliary_loss_clip": 0.01124745, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_clip": 1.02468085, + "balance_loss_mlp": 1.00339484, + "epoch": 0.6214152588228221, + "flos": 27965758135680.0, + "grad_norm": 1.551202532573341, + "language_loss": 0.75318336, + "learning_rate": 1.3241808196961077e-06, + "loss": 0.77526677, + "num_input_tokens_seen": 111383855, + "step": 5168, + "time_per_iteration": 2.750258207321167 + }, + { + "auxiliary_loss_clip": 0.01117388, + "auxiliary_loss_mlp": 0.01084417, + "balance_loss_clip": 1.02499294, + "balance_loss_mlp": 1.0042603, + "epoch": 0.6215355017134612, + "flos": 20230204965120.0, + "grad_norm": 1.6717488270636918, + "language_loss": 0.70628232, + "learning_rate": 1.3234477195507608e-06, + "loss": 0.72830039, + "num_input_tokens_seen": 111402685, + "step": 5169, + "time_per_iteration": 2.705129384994507 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01083944, + "balance_loss_clip": 1.02322721, + "balance_loss_mlp": 1.00373971, + "epoch": 0.6216557446041003, + "flos": 41428129219200.0, + "grad_norm": 2.145971253357296, + "language_loss": 0.62315226, + "learning_rate": 1.322714722037857e-06, + "loss": 0.64505637, + "num_input_tokens_seen": 111424130, + "step": 5170, + "time_per_iteration": 2.9303817749023438 + }, + { + "auxiliary_loss_clip": 0.01109805, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_clip": 1.02375245, + "balance_loss_mlp": 1.00586104, + "epoch": 0.6217759874947394, + "flos": 27928770105600.0, + "grad_norm": 2.029602043923772, + "language_loss": 0.77460802, + "learning_rate": 1.321981827268591e-06, + "loss": 0.79656863, + "num_input_tokens_seen": 111444785, + "step": 5171, + "time_per_iteration": 2.776756525039673 + }, + { + "auxiliary_loss_clip": 0.01117309, + "auxiliary_loss_mlp": 0.01083474, + "balance_loss_clip": 1.0238955, + "balance_loss_mlp": 1.00327051, + "epoch": 0.6218962303853784, + "flos": 21765673601280.0, + "grad_norm": 1.7046906312424206, + "language_loss": 0.81306475, + "learning_rate": 1.3212490353541426e-06, + "loss": 0.83507258, + "num_input_tokens_seen": 111467045, + "step": 5172, + "time_per_iteration": 2.762083053588867 + }, + { + "auxiliary_loss_clip": 0.01137475, + "auxiliary_loss_mlp": 0.01084742, + "balance_loss_clip": 1.02676046, + "balance_loss_mlp": 1.00439477, + "epoch": 0.6220164732760175, + "flos": 21246260981760.0, + "grad_norm": 2.1177847442420927, + "language_loss": 0.79904318, + "learning_rate": 1.3205163464056762e-06, + "loss": 0.82126528, + "num_input_tokens_seen": 111483650, + "step": 5173, + "time_per_iteration": 3.5843169689178467 + }, + { + "auxiliary_loss_clip": 0.01124704, + "auxiliary_loss_mlp": 0.01084844, + "balance_loss_clip": 1.02391708, + "balance_loss_mlp": 1.00468791, + "epoch": 0.6221367161666567, + "flos": 26136360506880.0, + "grad_norm": 1.7560621416765922, + "language_loss": 0.72423047, + "learning_rate": 1.319783760534339e-06, + "loss": 0.74632597, + "num_input_tokens_seen": 111502895, + "step": 5174, + "time_per_iteration": 2.697521686553955 + }, + { + "auxiliary_loss_clip": 0.01126836, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_clip": 1.02586365, + "balance_loss_mlp": 1.00417614, + "epoch": 0.6222569590572957, + "flos": 16284196558080.0, + "grad_norm": 1.9845729340517795, + "language_loss": 0.755862, + "learning_rate": 1.319051277851266e-06, + "loss": 0.77797461, + "num_input_tokens_seen": 111519180, + "step": 5175, + "time_per_iteration": 2.6275436878204346 + }, + { + "auxiliary_loss_clip": 0.01111577, + "auxiliary_loss_mlp": 0.01084082, + "balance_loss_clip": 1.02636147, + "balance_loss_mlp": 1.00383067, + "epoch": 0.6223772019479348, + "flos": 18223840005120.0, + "grad_norm": 1.814715422865343, + "language_loss": 0.84197801, + "learning_rate": 1.3183188984675716e-06, + "loss": 0.86393464, + "num_input_tokens_seen": 111537545, + "step": 5176, + "time_per_iteration": 2.63301420211792 + }, + { + "auxiliary_loss_clip": 0.01115723, + "auxiliary_loss_mlp": 0.01084066, + "balance_loss_clip": 1.02473402, + "balance_loss_mlp": 1.00376678, + "epoch": 0.6224974448385739, + "flos": 27489797994240.0, + "grad_norm": 3.9216750425487095, + "language_loss": 0.71290958, + "learning_rate": 1.3175866224943586e-06, + "loss": 0.73490739, + "num_input_tokens_seen": 111556265, + "step": 5177, + "time_per_iteration": 3.6345746517181396 + }, + { + "auxiliary_loss_clip": 0.01118381, + "auxiliary_loss_mlp": 0.01085808, + "balance_loss_clip": 1.02531552, + "balance_loss_mlp": 1.00546122, + "epoch": 0.622617687729213, + "flos": 19791951125760.0, + "grad_norm": 3.0961043765818044, + "language_loss": 0.73961604, + "learning_rate": 1.316854450042712e-06, + "loss": 0.76165789, + "num_input_tokens_seen": 111574205, + "step": 5178, + "time_per_iteration": 3.7161712646484375 + }, + { + "auxiliary_loss_clip": 0.01129383, + "auxiliary_loss_mlp": 0.01083746, + "balance_loss_clip": 1.02665758, + "balance_loss_mlp": 1.00344706, + "epoch": 0.622737930619852, + "flos": 23038886062080.0, + "grad_norm": 2.0651486465673785, + "language_loss": 0.74350715, + "learning_rate": 1.3161223812237024e-06, + "loss": 0.76563841, + "num_input_tokens_seen": 111593560, + "step": 5179, + "time_per_iteration": 2.644103527069092 + }, + { + "auxiliary_loss_clip": 0.01134911, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_clip": 1.02440798, + "balance_loss_mlp": 1.00369918, + "epoch": 0.6228581735104912, + "flos": 12634271959680.0, + "grad_norm": 2.129453293789534, + "language_loss": 0.85322678, + "learning_rate": 1.3153904161483842e-06, + "loss": 0.87541533, + "num_input_tokens_seen": 111608860, + "step": 5180, + "time_per_iteration": 2.555767059326172 + }, + { + "auxiliary_loss_clip": 0.01108698, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_clip": 1.02364933, + "balance_loss_mlp": 1.0042088, + "epoch": 0.6229784164011303, + "flos": 23802813538560.0, + "grad_norm": 1.8567301746681681, + "language_loss": 0.85103989, + "learning_rate": 1.3146585549277953e-06, + "loss": 0.87297148, + "num_input_tokens_seen": 111627500, + "step": 5181, + "time_per_iteration": 3.692584991455078 + }, + { + "auxiliary_loss_clip": 0.01104177, + "auxiliary_loss_mlp": 0.01086352, + "balance_loss_clip": 1.02619863, + "balance_loss_mlp": 1.00610077, + "epoch": 0.6230986592917693, + "flos": 22414219614720.0, + "grad_norm": 2.025059597615247, + "language_loss": 0.7835021, + "learning_rate": 1.3139267976729591e-06, + "loss": 0.8054074, + "num_input_tokens_seen": 111647690, + "step": 5182, + "time_per_iteration": 2.7015161514282227 + }, + { + "auxiliary_loss_clip": 0.01127712, + "auxiliary_loss_mlp": 0.0108372, + "balance_loss_clip": 1.02584207, + "balance_loss_mlp": 1.0035156, + "epoch": 0.6232189021824085, + "flos": 34528217028480.0, + "grad_norm": 1.6056377962843165, + "language_loss": 0.71783864, + "learning_rate": 1.3131951444948815e-06, + "loss": 0.73995298, + "num_input_tokens_seen": 111667090, + "step": 5183, + "time_per_iteration": 2.71871018409729 + }, + { + "auxiliary_loss_clip": 0.01118339, + "auxiliary_loss_mlp": 0.01085235, + "balance_loss_clip": 1.0259831, + "balance_loss_mlp": 1.00498366, + "epoch": 0.6233391450730476, + "flos": 22237000888320.0, + "grad_norm": 1.9418103914105869, + "language_loss": 0.76207608, + "learning_rate": 1.3124635955045546e-06, + "loss": 0.7841118, + "num_input_tokens_seen": 111686905, + "step": 5184, + "time_per_iteration": 2.843686103820801 + }, + { + "auxiliary_loss_clip": 0.01083937, + "auxiliary_loss_mlp": 0.0087289, + "balance_loss_clip": 1.02351439, + "balance_loss_mlp": 1.00008142, + "epoch": 0.6234593879636866, + "flos": 20332693445760.0, + "grad_norm": 1.7357018998795195, + "language_loss": 0.84337378, + "learning_rate": 1.3117321508129537e-06, + "loss": 0.8629421, + "num_input_tokens_seen": 111704985, + "step": 5185, + "time_per_iteration": 2.780081033706665 + }, + { + "auxiliary_loss_clip": 0.0111857, + "auxiliary_loss_mlp": 0.01084318, + "balance_loss_clip": 1.02577853, + "balance_loss_mlp": 1.0039711, + "epoch": 0.6235796308543258, + "flos": 20664903358080.0, + "grad_norm": 1.4900631437782221, + "language_loss": 0.76722479, + "learning_rate": 1.3110008105310388e-06, + "loss": 0.78925371, + "num_input_tokens_seen": 111724805, + "step": 5186, + "time_per_iteration": 2.706195831298828 + }, + { + "auxiliary_loss_clip": 0.01135687, + "auxiliary_loss_mlp": 0.01084134, + "balance_loss_clip": 1.02534676, + "balance_loss_mlp": 1.0037396, + "epoch": 0.6236998737449648, + "flos": 26618641441920.0, + "grad_norm": 1.5505335121817125, + "language_loss": 0.77828848, + "learning_rate": 1.3102695747697526e-06, + "loss": 0.80048674, + "num_input_tokens_seen": 111747675, + "step": 5187, + "time_per_iteration": 2.7421650886535645 + }, + { + "auxiliary_loss_clip": 0.01081529, + "auxiliary_loss_mlp": 0.01084538, + "balance_loss_clip": 1.02157593, + "balance_loss_mlp": 1.0041908, + "epoch": 0.6238201166356039, + "flos": 12674599954560.0, + "grad_norm": 2.0799132941749714, + "language_loss": 0.90637529, + "learning_rate": 1.3095384436400237e-06, + "loss": 0.92803591, + "num_input_tokens_seen": 111759205, + "step": 5188, + "time_per_iteration": 2.8082265853881836 + }, + { + "auxiliary_loss_clip": 0.01102915, + "auxiliary_loss_mlp": 0.0108434, + "balance_loss_clip": 1.02541125, + "balance_loss_mlp": 1.00404036, + "epoch": 0.623940359526243, + "flos": 10452160730880.0, + "grad_norm": 1.9227989974065245, + "language_loss": 0.82367408, + "learning_rate": 1.3088074172527633e-06, + "loss": 0.84554666, + "num_input_tokens_seen": 111776335, + "step": 5189, + "time_per_iteration": 2.647469997406006 + }, + { + "auxiliary_loss_clip": 0.01115719, + "auxiliary_loss_mlp": 0.01083325, + "balance_loss_clip": 1.02267647, + "balance_loss_mlp": 1.0031209, + "epoch": 0.6240606024168821, + "flos": 29059525226880.0, + "grad_norm": 1.8942753828474768, + "language_loss": 0.7151264, + "learning_rate": 1.3080764957188684e-06, + "loss": 0.73711681, + "num_input_tokens_seen": 111796580, + "step": 5190, + "time_per_iteration": 2.752794027328491 + }, + { + "auxiliary_loss_clip": 0.01086586, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_clip": 1.02480817, + "balance_loss_mlp": 1.00427246, + "epoch": 0.6241808453075212, + "flos": 22018089450240.0, + "grad_norm": 1.7876120831168194, + "language_loss": 0.70674992, + "learning_rate": 1.3073456791492192e-06, + "loss": 0.72846198, + "num_input_tokens_seen": 111816290, + "step": 5191, + "time_per_iteration": 2.818401575088501 + }, + { + "auxiliary_loss_clip": 0.0111793, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_clip": 1.02483702, + "balance_loss_mlp": 1.0035888, + "epoch": 0.6243010881981603, + "flos": 21138708683520.0, + "grad_norm": 1.8037218286670025, + "language_loss": 0.78036338, + "learning_rate": 1.3066149676546801e-06, + "loss": 0.8023802, + "num_input_tokens_seen": 111834470, + "step": 5192, + "time_per_iteration": 2.6429879665374756 + }, + { + "auxiliary_loss_clip": 0.01112911, + "auxiliary_loss_mlp": 0.01084238, + "balance_loss_clip": 1.02253544, + "balance_loss_mlp": 1.00408173, + "epoch": 0.6244213310887994, + "flos": 22344948236160.0, + "grad_norm": 1.6286263659504983, + "language_loss": 0.66194141, + "learning_rate": 1.3058843613460985e-06, + "loss": 0.68391287, + "num_input_tokens_seen": 111852410, + "step": 5193, + "time_per_iteration": 2.760495662689209 + }, + { + "auxiliary_loss_clip": 0.01094461, + "auxiliary_loss_mlp": 0.010847, + "balance_loss_clip": 1.02499676, + "balance_loss_mlp": 1.00440049, + "epoch": 0.6245415739794384, + "flos": 15231978524160.0, + "grad_norm": 1.7303976663604757, + "language_loss": 0.74375737, + "learning_rate": 1.3051538603343075e-06, + "loss": 0.765549, + "num_input_tokens_seen": 111870340, + "step": 5194, + "time_per_iteration": 2.7051546573638916 + }, + { + "auxiliary_loss_clip": 0.01125736, + "auxiliary_loss_mlp": 0.01084264, + "balance_loss_clip": 1.0248704, + "balance_loss_mlp": 1.00410819, + "epoch": 0.6246618168700776, + "flos": 18879891960960.0, + "grad_norm": 1.8188115287676925, + "language_loss": 0.68108177, + "learning_rate": 1.3044234647301235e-06, + "loss": 0.7031818, + "num_input_tokens_seen": 111888365, + "step": 5195, + "time_per_iteration": 2.6937055587768555 + }, + { + "auxiliary_loss_clip": 0.01125212, + "auxiliary_loss_mlp": 0.01086068, + "balance_loss_clip": 1.02471018, + "balance_loss_mlp": 1.00591159, + "epoch": 0.6247820597607167, + "flos": 14319201087360.0, + "grad_norm": 1.6845809621053756, + "language_loss": 0.72213233, + "learning_rate": 1.303693174644347e-06, + "loss": 0.74424505, + "num_input_tokens_seen": 111905840, + "step": 5196, + "time_per_iteration": 2.6587846279144287 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_clip": 1.02441299, + "balance_loss_mlp": 1.00372887, + "epoch": 0.6249023026513557, + "flos": 22637979388800.0, + "grad_norm": 1.9761668979564473, + "language_loss": 0.80329812, + "learning_rate": 1.3029629901877625e-06, + "loss": 0.82531273, + "num_input_tokens_seen": 111925215, + "step": 5197, + "time_per_iteration": 2.7147161960601807 + }, + { + "auxiliary_loss_clip": 0.01113714, + "auxiliary_loss_mlp": 0.01084972, + "balance_loss_clip": 1.02777886, + "balance_loss_mlp": 1.00462472, + "epoch": 0.6250225455419949, + "flos": 20266690204800.0, + "grad_norm": 2.504994319642213, + "language_loss": 0.76965344, + "learning_rate": 1.3022329114711376e-06, + "loss": 0.79164028, + "num_input_tokens_seen": 111943925, + "step": 5198, + "time_per_iteration": 2.65852952003479 + }, + { + "auxiliary_loss_clip": 0.01118057, + "auxiliary_loss_mlp": 0.01084503, + "balance_loss_clip": 1.02622008, + "balance_loss_mlp": 1.00425172, + "epoch": 0.6251427884326339, + "flos": 23437853400960.0, + "grad_norm": 2.9294324341779445, + "language_loss": 0.69574118, + "learning_rate": 1.3015029386052256e-06, + "loss": 0.71776676, + "num_input_tokens_seen": 111964095, + "step": 5199, + "time_per_iteration": 3.639827251434326 + }, + { + "auxiliary_loss_clip": 0.01093616, + "auxiliary_loss_mlp": 0.01084399, + "balance_loss_clip": 1.0242722, + "balance_loss_mlp": 1.00409937, + "epoch": 0.625263031323273, + "flos": 31723055464320.0, + "grad_norm": 1.825035925877611, + "language_loss": 0.72695005, + "learning_rate": 1.3007730717007622e-06, + "loss": 0.74873024, + "num_input_tokens_seen": 111984910, + "step": 5200, + "time_per_iteration": 2.8002099990844727 + }, + { + "auxiliary_loss_clip": 0.01137249, + "auxiliary_loss_mlp": 0.01083401, + "balance_loss_clip": 1.02661967, + "balance_loss_mlp": 1.00305438, + "epoch": 0.6253832742139122, + "flos": 24134341092480.0, + "grad_norm": 1.8766080282618425, + "language_loss": 0.75251162, + "learning_rate": 1.3000433108684676e-06, + "loss": 0.77471817, + "num_input_tokens_seen": 112005410, + "step": 5201, + "time_per_iteration": 2.5767369270324707 + }, + { + "auxiliary_loss_clip": 0.0112529, + "auxiliary_loss_mlp": 0.01084947, + "balance_loss_clip": 1.02444541, + "balance_loss_mlp": 1.00455189, + "epoch": 0.6255035171045512, + "flos": 27668812400640.0, + "grad_norm": 2.337203670402948, + "language_loss": 0.80331993, + "learning_rate": 1.2993136562190467e-06, + "loss": 0.82542229, + "num_input_tokens_seen": 112024530, + "step": 5202, + "time_per_iteration": 2.739626407623291 + }, + { + "auxiliary_loss_clip": 0.0111926, + "auxiliary_loss_mlp": 0.01084973, + "balance_loss_clip": 1.02630424, + "balance_loss_mlp": 1.0048641, + "epoch": 0.6256237599951903, + "flos": 20227798753920.0, + "grad_norm": 1.4884043603205526, + "language_loss": 0.70339674, + "learning_rate": 1.2985841078631871e-06, + "loss": 0.72543907, + "num_input_tokens_seen": 112043850, + "step": 5203, + "time_per_iteration": 3.6940512657165527 + }, + { + "auxiliary_loss_clip": 0.01089069, + "auxiliary_loss_mlp": 0.01084748, + "balance_loss_clip": 1.02150857, + "balance_loss_mlp": 1.00454438, + "epoch": 0.6257440028858293, + "flos": 24170574936960.0, + "grad_norm": 1.6764848546065467, + "language_loss": 0.78312516, + "learning_rate": 1.2978546659115608e-06, + "loss": 0.80486333, + "num_input_tokens_seen": 112061930, + "step": 5204, + "time_per_iteration": 3.832019805908203 + }, + { + "auxiliary_loss_clip": 0.01117911, + "auxiliary_loss_mlp": 0.01086069, + "balance_loss_clip": 1.02496457, + "balance_loss_mlp": 1.00586545, + "epoch": 0.6258642457764685, + "flos": 15851940289920.0, + "grad_norm": 1.844640522182473, + "language_loss": 0.85566765, + "learning_rate": 1.2971253304748228e-06, + "loss": 0.87770742, + "num_input_tokens_seen": 112079645, + "step": 5205, + "time_per_iteration": 2.6652934551239014 + }, + { + "auxiliary_loss_clip": 0.01127206, + "auxiliary_loss_mlp": 0.01084986, + "balance_loss_clip": 1.02664566, + "balance_loss_mlp": 1.00463915, + "epoch": 0.6259844886671075, + "flos": 11911354836480.0, + "grad_norm": 1.6773398979748244, + "language_loss": 0.7484858, + "learning_rate": 1.296396101663614e-06, + "loss": 0.77060777, + "num_input_tokens_seen": 112096205, + "step": 5206, + "time_per_iteration": 3.5442256927490234 + }, + { + "auxiliary_loss_clip": 0.01127636, + "auxiliary_loss_mlp": 0.01084751, + "balance_loss_clip": 1.02587295, + "balance_loss_mlp": 1.00440371, + "epoch": 0.6261047315577466, + "flos": 15887958652800.0, + "grad_norm": 1.9371486136646727, + "language_loss": 0.84102207, + "learning_rate": 1.2956669795885565e-06, + "loss": 0.86314601, + "num_input_tokens_seen": 112112835, + "step": 5207, + "time_per_iteration": 2.5887291431427 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.0108668, + "balance_loss_clip": 1.02479744, + "balance_loss_mlp": 1.00638068, + "epoch": 0.6262249744483858, + "flos": 31248926916480.0, + "grad_norm": 1.8120534280593965, + "language_loss": 0.67854321, + "learning_rate": 1.294937964360259e-06, + "loss": 0.70050192, + "num_input_tokens_seen": 112133105, + "step": 5208, + "time_per_iteration": 2.841846227645874 + }, + { + "auxiliary_loss_clip": 0.01102451, + "auxiliary_loss_mlp": 0.01085086, + "balance_loss_clip": 1.02515495, + "balance_loss_mlp": 1.00469172, + "epoch": 0.6263452173390248, + "flos": 27198598435200.0, + "grad_norm": 2.1896998198668287, + "language_loss": 0.71714687, + "learning_rate": 1.2942090560893108e-06, + "loss": 0.73902225, + "num_input_tokens_seen": 112152510, + "step": 5209, + "time_per_iteration": 2.76066255569458 + }, + { + "auxiliary_loss_clip": 0.01136875, + "auxiliary_loss_mlp": 0.01083704, + "balance_loss_clip": 1.02633071, + "balance_loss_mlp": 1.00354755, + "epoch": 0.6264654602296639, + "flos": 37342069683840.0, + "grad_norm": 1.8770661609362596, + "language_loss": 0.60028785, + "learning_rate": 1.2934802548862882e-06, + "loss": 0.62249362, + "num_input_tokens_seen": 112175295, + "step": 5210, + "time_per_iteration": 2.7304630279541016 + }, + { + "auxiliary_loss_clip": 0.01115379, + "auxiliary_loss_mlp": 0.01084124, + "balance_loss_clip": 1.02341461, + "balance_loss_mlp": 1.00382471, + "epoch": 0.626585703120303, + "flos": 14756952136320.0, + "grad_norm": 1.7985064084154365, + "language_loss": 0.82956856, + "learning_rate": 1.292751560861749e-06, + "loss": 0.85156357, + "num_input_tokens_seen": 112190200, + "step": 5211, + "time_per_iteration": 2.642524242401123 + }, + { + "auxiliary_loss_clip": 0.01137039, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_clip": 1.02638984, + "balance_loss_mlp": 1.00413013, + "epoch": 0.6267059460109421, + "flos": 22347318533760.0, + "grad_norm": 1.7612029448291187, + "language_loss": 0.79240859, + "learning_rate": 1.2920229741262354e-06, + "loss": 0.81462371, + "num_input_tokens_seen": 112208205, + "step": 5212, + "time_per_iteration": 2.616361379623413 + }, + { + "auxiliary_loss_clip": 0.01117418, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_clip": 1.02497625, + "balance_loss_mlp": 1.00427294, + "epoch": 0.6268261889015811, + "flos": 17748813617280.0, + "grad_norm": 1.785468392980286, + "language_loss": 0.75399286, + "learning_rate": 1.2912944947902739e-06, + "loss": 0.77601224, + "num_input_tokens_seen": 112224690, + "step": 5213, + "time_per_iteration": 2.642791748046875 + }, + { + "auxiliary_loss_clip": 0.01119626, + "auxiliary_loss_mlp": 0.01085144, + "balance_loss_clip": 1.02553892, + "balance_loss_mlp": 1.00474954, + "epoch": 0.6269464317922203, + "flos": 32846484211200.0, + "grad_norm": 3.7352080328938064, + "language_loss": 0.71299314, + "learning_rate": 1.2905661229643742e-06, + "loss": 0.73504084, + "num_input_tokens_seen": 112244450, + "step": 5214, + "time_per_iteration": 2.831049680709839 + }, + { + "auxiliary_loss_clip": 0.01136218, + "auxiliary_loss_mlp": 0.01084692, + "balance_loss_clip": 1.0257839, + "balance_loss_mlp": 1.00439262, + "epoch": 0.6270666746828594, + "flos": 17929192740480.0, + "grad_norm": 2.0000138960394866, + "language_loss": 0.84182453, + "learning_rate": 1.2898378587590299e-06, + "loss": 0.86403358, + "num_input_tokens_seen": 112261050, + "step": 5215, + "time_per_iteration": 2.594770908355713 + }, + { + "auxiliary_loss_clip": 0.0112612, + "auxiliary_loss_mlp": 0.01085452, + "balance_loss_clip": 1.02477133, + "balance_loss_mlp": 1.00515223, + "epoch": 0.6271869175734984, + "flos": 17457326749440.0, + "grad_norm": 1.7944704426056861, + "language_loss": 0.87827396, + "learning_rate": 1.2891097022847173e-06, + "loss": 0.90038967, + "num_input_tokens_seen": 112278395, + "step": 5216, + "time_per_iteration": 2.636439800262451 + }, + { + "auxiliary_loss_clip": 0.01116087, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_clip": 1.02437329, + "balance_loss_mlp": 1.00492716, + "epoch": 0.6273071604641376, + "flos": 26868615166080.0, + "grad_norm": 1.71896113361997, + "language_loss": 0.66586733, + "learning_rate": 1.2883816536518978e-06, + "loss": 0.68788141, + "num_input_tokens_seen": 112299535, + "step": 5217, + "time_per_iteration": 2.7755494117736816 + }, + { + "auxiliary_loss_clip": 0.0112695, + "auxiliary_loss_mlp": 0.01084013, + "balance_loss_clip": 1.02585089, + "balance_loss_mlp": 1.0039041, + "epoch": 0.6274274033547766, + "flos": 26062384446720.0, + "grad_norm": 1.7202585467984939, + "language_loss": 0.81833398, + "learning_rate": 1.2876537129710155e-06, + "loss": 0.84044361, + "num_input_tokens_seen": 112317265, + "step": 5218, + "time_per_iteration": 2.6793224811553955 + }, + { + "auxiliary_loss_clip": 0.01106593, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_clip": 1.02243328, + "balance_loss_mlp": 1.00400245, + "epoch": 0.6275476462454157, + "flos": 20266259241600.0, + "grad_norm": 2.0495680608280655, + "language_loss": 0.75506604, + "learning_rate": 1.286925880352499e-06, + "loss": 0.77697498, + "num_input_tokens_seen": 112336125, + "step": 5219, + "time_per_iteration": 2.7001776695251465 + }, + { + "auxiliary_loss_clip": 0.01112802, + "auxiliary_loss_mlp": 0.01083466, + "balance_loss_clip": 1.02168489, + "balance_loss_mlp": 1.00321472, + "epoch": 0.6276678891360549, + "flos": 26320402817280.0, + "grad_norm": 1.693176785994631, + "language_loss": 0.70973051, + "learning_rate": 1.2861981559067592e-06, + "loss": 0.73169327, + "num_input_tokens_seen": 112356730, + "step": 5220, + "time_per_iteration": 2.744826078414917 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01083084, + "balance_loss_clip": 1.02369797, + "balance_loss_mlp": 1.00278425, + "epoch": 0.6277881320266939, + "flos": 13912512324480.0, + "grad_norm": 1.7488249258849, + "language_loss": 0.80202901, + "learning_rate": 1.2854705397441917e-06, + "loss": 0.82384962, + "num_input_tokens_seen": 112372270, + "step": 5221, + "time_per_iteration": 2.8147857189178467 + }, + { + "auxiliary_loss_clip": 0.0110686, + "auxiliary_loss_mlp": 0.01083719, + "balance_loss_clip": 1.02338052, + "balance_loss_mlp": 1.00356245, + "epoch": 0.627908374917333, + "flos": 27048922462080.0, + "grad_norm": 3.0038493652791267, + "language_loss": 0.77728617, + "learning_rate": 1.2847430319751747e-06, + "loss": 0.79919195, + "num_input_tokens_seen": 112390365, + "step": 5222, + "time_per_iteration": 2.7900550365448 + }, + { + "auxiliary_loss_clip": 0.01120519, + "auxiliary_loss_mlp": 0.01083822, + "balance_loss_clip": 1.02610111, + "balance_loss_mlp": 1.00352252, + "epoch": 0.6280286178079721, + "flos": 23769201386880.0, + "grad_norm": 2.054547674993909, + "language_loss": 0.6731981, + "learning_rate": 1.2840156327100712e-06, + "loss": 0.69524157, + "num_input_tokens_seen": 112407490, + "step": 5223, + "time_per_iteration": 2.676953077316284 + }, + { + "auxiliary_loss_clip": 0.01138115, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_clip": 1.02769303, + "balance_loss_mlp": 1.00409794, + "epoch": 0.6281488606986112, + "flos": 26359150613760.0, + "grad_norm": 1.7265551611983148, + "language_loss": 0.72236323, + "learning_rate": 1.2832883420592272e-06, + "loss": 0.74458742, + "num_input_tokens_seen": 112426385, + "step": 5224, + "time_per_iteration": 3.5226001739501953 + }, + { + "auxiliary_loss_clip": 0.01118522, + "auxiliary_loss_mlp": 0.01084711, + "balance_loss_clip": 1.02664983, + "balance_loss_mlp": 1.00436425, + "epoch": 0.6282691035892503, + "flos": 36137194848000.0, + "grad_norm": 2.1807277537084855, + "language_loss": 0.64173567, + "learning_rate": 1.282561160132972e-06, + "loss": 0.66376805, + "num_input_tokens_seen": 112446905, + "step": 5225, + "time_per_iteration": 2.935821056365967 + }, + { + "auxiliary_loss_clip": 0.01119374, + "auxiliary_loss_mlp": 0.01084633, + "balance_loss_clip": 1.02490354, + "balance_loss_mlp": 1.00438094, + "epoch": 0.6283893464798894, + "flos": 26537231266560.0, + "grad_norm": 1.509893513712171, + "language_loss": 0.80748808, + "learning_rate": 1.2818340870416186e-06, + "loss": 0.82952815, + "num_input_tokens_seen": 112468040, + "step": 5226, + "time_per_iteration": 2.8673110008239746 + }, + { + "auxiliary_loss_clip": 0.01110134, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_clip": 1.02405548, + "balance_loss_mlp": 1.00404179, + "epoch": 0.6285095893705285, + "flos": 22237216369920.0, + "grad_norm": 2.0982530686744907, + "language_loss": 0.76084268, + "learning_rate": 1.2811071228954626e-06, + "loss": 0.78278887, + "num_input_tokens_seen": 112486675, + "step": 5227, + "time_per_iteration": 2.7432289123535156 + }, + { + "auxiliary_loss_clip": 0.01118729, + "auxiliary_loss_mlp": 0.01084947, + "balance_loss_clip": 1.02556705, + "balance_loss_mlp": 1.00479078, + "epoch": 0.6286298322611675, + "flos": 26542259170560.0, + "grad_norm": 2.0721266765504165, + "language_loss": 0.80550539, + "learning_rate": 1.2803802678047846e-06, + "loss": 0.82754219, + "num_input_tokens_seen": 112506825, + "step": 5228, + "time_per_iteration": 2.8354811668395996 + }, + { + "auxiliary_loss_clip": 0.01116827, + "auxiliary_loss_mlp": 0.01086103, + "balance_loss_clip": 1.02487898, + "balance_loss_mlp": 1.00566018, + "epoch": 0.6287500751518067, + "flos": 21795227516160.0, + "grad_norm": 1.723847263695593, + "language_loss": 0.74150383, + "learning_rate": 1.279653521879848e-06, + "loss": 0.76353306, + "num_input_tokens_seen": 112526890, + "step": 5229, + "time_per_iteration": 4.619771480560303 + }, + { + "auxiliary_loss_clip": 0.01076876, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_clip": 1.01988006, + "balance_loss_mlp": 1.0036267, + "epoch": 0.6288703180424458, + "flos": 20009605587840.0, + "grad_norm": 1.8589217951132184, + "language_loss": 0.83789521, + "learning_rate": 1.2789268852308997e-06, + "loss": 0.85950232, + "num_input_tokens_seen": 112542100, + "step": 5230, + "time_per_iteration": 2.837517738342285 + }, + { + "auxiliary_loss_clip": 0.01129615, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_clip": 1.02730298, + "balance_loss_mlp": 1.00536764, + "epoch": 0.6289905609330848, + "flos": 22124923476480.0, + "grad_norm": 1.9063870990615275, + "language_loss": 0.70591229, + "learning_rate": 1.2782003579681688e-06, + "loss": 0.72806466, + "num_input_tokens_seen": 112561630, + "step": 5231, + "time_per_iteration": 3.6709742546081543 + }, + { + "auxiliary_loss_clip": 0.01136954, + "auxiliary_loss_mlp": 0.01084116, + "balance_loss_clip": 1.02639341, + "balance_loss_mlp": 1.00386429, + "epoch": 0.629110803823724, + "flos": 25518481729920.0, + "grad_norm": 2.00205236033348, + "language_loss": 0.74382818, + "learning_rate": 1.2774739402018701e-06, + "loss": 0.76603884, + "num_input_tokens_seen": 112582465, + "step": 5232, + "time_per_iteration": 2.7603061199188232 + }, + { + "auxiliary_loss_clip": 0.01121637, + "auxiliary_loss_mlp": 0.01085333, + "balance_loss_clip": 1.02249312, + "balance_loss_mlp": 1.00493813, + "epoch": 0.629231046714363, + "flos": 20886616056960.0, + "grad_norm": 1.675803779940099, + "language_loss": 0.73042047, + "learning_rate": 1.2767476320422002e-06, + "loss": 0.75249016, + "num_input_tokens_seen": 112602390, + "step": 5233, + "time_per_iteration": 2.6566593647003174 + }, + { + "auxiliary_loss_clip": 0.01092449, + "auxiliary_loss_mlp": 0.01079051, + "balance_loss_clip": 1.02038836, + "balance_loss_mlp": 1.00008702, + "epoch": 0.6293512896050021, + "flos": 65050027908480.0, + "grad_norm": 0.6972905825476965, + "language_loss": 0.57274544, + "learning_rate": 1.2760214335993392e-06, + "loss": 0.59446043, + "num_input_tokens_seen": 112669035, + "step": 5234, + "time_per_iteration": 3.4103691577911377 + }, + { + "auxiliary_loss_clip": 0.0112722, + "auxiliary_loss_mlp": 0.01084462, + "balance_loss_clip": 1.02539182, + "balance_loss_mlp": 1.00425792, + "epoch": 0.6294715324956413, + "flos": 34677857088000.0, + "grad_norm": 1.9882635885037845, + "language_loss": 0.59023595, + "learning_rate": 1.2752953449834514e-06, + "loss": 0.61235273, + "num_input_tokens_seen": 112691485, + "step": 5235, + "time_per_iteration": 2.7338857650756836 + }, + { + "auxiliary_loss_clip": 0.01135811, + "auxiliary_loss_mlp": 0.01085257, + "balance_loss_clip": 1.02553606, + "balance_loss_mlp": 1.0051012, + "epoch": 0.6295917753862803, + "flos": 22784207656320.0, + "grad_norm": 1.5687353317555097, + "language_loss": 0.80156672, + "learning_rate": 1.2745693663046836e-06, + "loss": 0.82377744, + "num_input_tokens_seen": 112710555, + "step": 5236, + "time_per_iteration": 2.630999803543091 + }, + { + "auxiliary_loss_clip": 0.01125427, + "auxiliary_loss_mlp": 0.01083484, + "balance_loss_clip": 1.02506316, + "balance_loss_mlp": 1.0033282, + "epoch": 0.6297120182769194, + "flos": 20850454039680.0, + "grad_norm": 1.6941016117813292, + "language_loss": 0.8083443, + "learning_rate": 1.2738434976731662e-06, + "loss": 0.83043349, + "num_input_tokens_seen": 112728740, + "step": 5237, + "time_per_iteration": 2.6290488243103027 + }, + { + "auxiliary_loss_clip": 0.01119824, + "auxiliary_loss_mlp": 0.01084573, + "balance_loss_clip": 1.02728188, + "balance_loss_mlp": 1.00432134, + "epoch": 0.6298322611675584, + "flos": 19497662997120.0, + "grad_norm": 1.5182725485172903, + "language_loss": 0.75062907, + "learning_rate": 1.2731177391990125e-06, + "loss": 0.77267301, + "num_input_tokens_seen": 112748665, + "step": 5238, + "time_per_iteration": 2.68487811088562 + }, + { + "auxiliary_loss_clip": 0.01115449, + "auxiliary_loss_mlp": 0.01084363, + "balance_loss_clip": 1.02306509, + "balance_loss_mlp": 1.0040164, + "epoch": 0.6299525040581976, + "flos": 12604466649600.0, + "grad_norm": 1.8887993057329757, + "language_loss": 0.81756419, + "learning_rate": 1.2723920909923203e-06, + "loss": 0.8395623, + "num_input_tokens_seen": 112764410, + "step": 5239, + "time_per_iteration": 2.6719565391540527 + }, + { + "auxiliary_loss_clip": 0.01116683, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_clip": 1.02079988, + "balance_loss_mlp": 1.00007975, + "epoch": 0.6300727469488366, + "flos": 57725685636480.0, + "grad_norm": 0.860107455568228, + "language_loss": 0.60503411, + "learning_rate": 1.2716665531631688e-06, + "loss": 0.62699139, + "num_input_tokens_seen": 112818695, + "step": 5240, + "time_per_iteration": 3.1457149982452393 + }, + { + "auxiliary_loss_clip": 0.01111181, + "auxiliary_loss_mlp": 0.01083877, + "balance_loss_clip": 1.02527893, + "balance_loss_mlp": 1.00357795, + "epoch": 0.6301929898394757, + "flos": 22527302607360.0, + "grad_norm": 1.810967072889062, + "language_loss": 0.7711854, + "learning_rate": 1.270941125821623e-06, + "loss": 0.79313588, + "num_input_tokens_seen": 112839120, + "step": 5241, + "time_per_iteration": 2.718137502670288 + }, + { + "auxiliary_loss_clip": 0.01127083, + "auxiliary_loss_mlp": 0.01084423, + "balance_loss_clip": 1.02484143, + "balance_loss_mlp": 1.00407553, + "epoch": 0.6303132327301149, + "flos": 28293550675200.0, + "grad_norm": 1.683982988603278, + "language_loss": 0.75502622, + "learning_rate": 1.2702158090777278e-06, + "loss": 0.77714133, + "num_input_tokens_seen": 112860210, + "step": 5242, + "time_per_iteration": 2.711902379989624 + }, + { + "auxiliary_loss_clip": 0.01109855, + "auxiliary_loss_mlp": 0.01084966, + "balance_loss_clip": 1.024894, + "balance_loss_mlp": 1.00476217, + "epoch": 0.6304334756207539, + "flos": 25264521596160.0, + "grad_norm": 1.708782644294752, + "language_loss": 0.74683106, + "learning_rate": 1.2694906030415148e-06, + "loss": 0.76877934, + "num_input_tokens_seen": 112877955, + "step": 5243, + "time_per_iteration": 2.8588526248931885 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01084765, + "balance_loss_clip": 1.02485895, + "balance_loss_mlp": 1.00437021, + "epoch": 0.630553718511393, + "flos": 18033548728320.0, + "grad_norm": 2.401536394589691, + "language_loss": 0.81772304, + "learning_rate": 1.2687655078229958e-06, + "loss": 0.83976358, + "num_input_tokens_seen": 112892285, + "step": 5244, + "time_per_iteration": 2.66288161277771 + }, + { + "auxiliary_loss_clip": 0.01115477, + "auxiliary_loss_mlp": 0.01085224, + "balance_loss_clip": 1.02390528, + "balance_loss_mlp": 1.00497186, + "epoch": 0.6306739614020321, + "flos": 27304103658240.0, + "grad_norm": 1.9863968180528895, + "language_loss": 0.69094014, + "learning_rate": 1.2680405235321678e-06, + "loss": 0.71294713, + "num_input_tokens_seen": 112913620, + "step": 5245, + "time_per_iteration": 2.7912094593048096 + }, + { + "auxiliary_loss_clip": 0.01112633, + "auxiliary_loss_mlp": 0.00872931, + "balance_loss_clip": 1.02528179, + "balance_loss_mlp": 1.00008702, + "epoch": 0.6307942042926712, + "flos": 15341434243200.0, + "grad_norm": 2.373288328598164, + "language_loss": 0.78532553, + "learning_rate": 1.267315650279011e-06, + "loss": 0.80518115, + "num_input_tokens_seen": 112932090, + "step": 5246, + "time_per_iteration": 2.700540065765381 + }, + { + "auxiliary_loss_clip": 0.01098619, + "auxiliary_loss_mlp": 0.01083764, + "balance_loss_clip": 1.02182078, + "balance_loss_mlp": 1.00360703, + "epoch": 0.6309144471833102, + "flos": 19606400444160.0, + "grad_norm": 1.9136533590247335, + "language_loss": 0.74132383, + "learning_rate": 1.2665908881734874e-06, + "loss": 0.76314765, + "num_input_tokens_seen": 112950925, + "step": 5247, + "time_per_iteration": 2.786221981048584 + }, + { + "auxiliary_loss_clip": 0.01127076, + "auxiliary_loss_mlp": 0.01084005, + "balance_loss_clip": 1.02606153, + "balance_loss_mlp": 1.00384915, + "epoch": 0.6310346900739494, + "flos": 17493345112320.0, + "grad_norm": 1.9457414460516091, + "language_loss": 0.84883314, + "learning_rate": 1.2658662373255432e-06, + "loss": 0.8709439, + "num_input_tokens_seen": 112969315, + "step": 5248, + "time_per_iteration": 2.6882922649383545 + }, + { + "auxiliary_loss_clip": 0.01098369, + "auxiliary_loss_mlp": 0.01078905, + "balance_loss_clip": 1.01953685, + "balance_loss_mlp": 0.99994105, + "epoch": 0.6311549329645885, + "flos": 55070164131840.0, + "grad_norm": 0.7064961986488213, + "language_loss": 0.52261126, + "learning_rate": 1.2651416978451063e-06, + "loss": 0.54438406, + "num_input_tokens_seen": 113034700, + "step": 5249, + "time_per_iteration": 3.3380322456359863 + }, + { + "auxiliary_loss_clip": 0.0113787, + "auxiliary_loss_mlp": 0.01086072, + "balance_loss_clip": 1.0276916, + "balance_loss_mlp": 1.00572443, + "epoch": 0.6312751758552275, + "flos": 41902545075840.0, + "grad_norm": 1.7603207654444375, + "language_loss": 0.65255076, + "learning_rate": 1.2644172698420903e-06, + "loss": 0.6747902, + "num_input_tokens_seen": 113056805, + "step": 5250, + "time_per_iteration": 3.6610116958618164 + }, + { + "auxiliary_loss_clip": 0.01106451, + "auxiliary_loss_mlp": 0.01084062, + "balance_loss_clip": 1.02318704, + "balance_loss_mlp": 1.00371504, + "epoch": 0.6313954187458667, + "flos": 19646800266240.0, + "grad_norm": 1.6968302488175055, + "language_loss": 0.84633487, + "learning_rate": 1.2636929534263892e-06, + "loss": 0.86824, + "num_input_tokens_seen": 113075790, + "step": 5251, + "time_per_iteration": 2.7343971729278564 + }, + { + "auxiliary_loss_clip": 0.01109776, + "auxiliary_loss_mlp": 0.01084072, + "balance_loss_clip": 1.02398753, + "balance_loss_mlp": 1.00386763, + "epoch": 0.6315156616365057, + "flos": 22894273906560.0, + "grad_norm": 1.6410535468261815, + "language_loss": 0.77418017, + "learning_rate": 1.2629687487078821e-06, + "loss": 0.79611874, + "num_input_tokens_seen": 113094600, + "step": 5252, + "time_per_iteration": 2.74934720993042 + }, + { + "auxiliary_loss_clip": 0.01127172, + "auxiliary_loss_mlp": 0.01084922, + "balance_loss_clip": 1.02536058, + "balance_loss_mlp": 1.00462306, + "epoch": 0.6316359045271448, + "flos": 23726251699200.0, + "grad_norm": 1.998051552613835, + "language_loss": 0.7661221, + "learning_rate": 1.2622446557964293e-06, + "loss": 0.78824306, + "num_input_tokens_seen": 113112605, + "step": 5253, + "time_per_iteration": 2.6800546646118164 + }, + { + "auxiliary_loss_clip": 0.01118445, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_clip": 1.02428484, + "balance_loss_mlp": 1.00383472, + "epoch": 0.631756147417784, + "flos": 33108417164160.0, + "grad_norm": 5.969935705398276, + "language_loss": 0.71434391, + "learning_rate": 1.261520674801876e-06, + "loss": 0.7363683, + "num_input_tokens_seen": 113133200, + "step": 5254, + "time_per_iteration": 3.702977418899536 + }, + { + "auxiliary_loss_clip": 0.01111256, + "auxiliary_loss_mlp": 0.01084227, + "balance_loss_clip": 1.02176881, + "balance_loss_mlp": 1.00397491, + "epoch": 0.631876390308423, + "flos": 31248424126080.0, + "grad_norm": 1.9969415521291904, + "language_loss": 0.72404373, + "learning_rate": 1.2607968058340488e-06, + "loss": 0.7459985, + "num_input_tokens_seen": 113152895, + "step": 5255, + "time_per_iteration": 3.737154722213745 + }, + { + "auxiliary_loss_clip": 0.01116004, + "auxiliary_loss_mlp": 0.01084202, + "balance_loss_clip": 1.02383876, + "balance_loss_mlp": 1.00395036, + "epoch": 0.6319966331990621, + "flos": 24681152810880.0, + "grad_norm": 1.6844161727641167, + "language_loss": 0.72834969, + "learning_rate": 1.2600730490027583e-06, + "loss": 0.75035179, + "num_input_tokens_seen": 113173135, + "step": 5256, + "time_per_iteration": 3.5809550285339355 + }, + { + "auxiliary_loss_clip": 0.01107707, + "auxiliary_loss_mlp": 0.01085344, + "balance_loss_clip": 1.02388144, + "balance_loss_mlp": 1.00504422, + "epoch": 0.6321168760897012, + "flos": 17491764913920.0, + "grad_norm": 1.5771175133417106, + "language_loss": 0.80293143, + "learning_rate": 1.2593494044177984e-06, + "loss": 0.82486188, + "num_input_tokens_seen": 113191440, + "step": 5257, + "time_per_iteration": 2.6762866973876953 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01084141, + "balance_loss_clip": 1.02481127, + "balance_loss_mlp": 1.00360346, + "epoch": 0.6322371189803403, + "flos": 18295373940480.0, + "grad_norm": 2.471305130325758, + "language_loss": 0.80296576, + "learning_rate": 1.2586258721889448e-06, + "loss": 0.82515883, + "num_input_tokens_seen": 113208790, + "step": 5258, + "time_per_iteration": 2.610037326812744 + }, + { + "auxiliary_loss_clip": 0.01091055, + "auxiliary_loss_mlp": 0.01085464, + "balance_loss_clip": 1.0226059, + "balance_loss_mlp": 1.00521207, + "epoch": 0.6323573618709794, + "flos": 20157270399360.0, + "grad_norm": 2.2227225634310996, + "language_loss": 0.81446034, + "learning_rate": 1.2579024524259573e-06, + "loss": 0.83622551, + "num_input_tokens_seen": 113225050, + "step": 5259, + "time_per_iteration": 2.8139495849609375 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01084335, + "balance_loss_clip": 1.0250392, + "balance_loss_mlp": 1.00398767, + "epoch": 0.6324776047616185, + "flos": 20042391726720.0, + "grad_norm": 2.018540065305034, + "language_loss": 0.91349971, + "learning_rate": 1.2571791452385768e-06, + "loss": 0.93553269, + "num_input_tokens_seen": 113242315, + "step": 5260, + "time_per_iteration": 2.8917534351348877 + }, + { + "auxiliary_loss_clip": 0.01117479, + "auxiliary_loss_mlp": 0.01084978, + "balance_loss_clip": 1.02533531, + "balance_loss_mlp": 1.00482154, + "epoch": 0.6325978476522576, + "flos": 30848235724800.0, + "grad_norm": 1.5521748501303687, + "language_loss": 0.77296746, + "learning_rate": 1.2564559507365301e-06, + "loss": 0.79499203, + "num_input_tokens_seen": 113264720, + "step": 5261, + "time_per_iteration": 2.796069383621216 + }, + { + "auxiliary_loss_clip": 0.01112642, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_clip": 1.02531421, + "balance_loss_mlp": 1.00417805, + "epoch": 0.6327180905428966, + "flos": 24535104111360.0, + "grad_norm": 2.765845088137332, + "language_loss": 0.78814173, + "learning_rate": 1.2557328690295244e-06, + "loss": 0.81011337, + "num_input_tokens_seen": 113282910, + "step": 5262, + "time_per_iteration": 2.726491689682007 + }, + { + "auxiliary_loss_clip": 0.0109305, + "auxiliary_loss_mlp": 0.01086046, + "balance_loss_clip": 1.02519834, + "balance_loss_mlp": 1.00584197, + "epoch": 0.6328383334335358, + "flos": 21575274583680.0, + "grad_norm": 1.5736617753065436, + "language_loss": 0.76179665, + "learning_rate": 1.255009900227251e-06, + "loss": 0.78358769, + "num_input_tokens_seen": 113301935, + "step": 5263, + "time_per_iteration": 2.7883241176605225 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.01084829, + "balance_loss_clip": 1.02620316, + "balance_loss_mlp": 1.00467229, + "epoch": 0.6329585763241748, + "flos": 22929861306240.0, + "grad_norm": 1.7054098757121399, + "language_loss": 0.79227006, + "learning_rate": 1.254287044439383e-06, + "loss": 0.81447363, + "num_input_tokens_seen": 113321540, + "step": 5264, + "time_per_iteration": 2.5828535556793213 + }, + { + "auxiliary_loss_clip": 0.01115945, + "auxiliary_loss_mlp": 0.01078874, + "balance_loss_clip": 1.02020311, + "balance_loss_mlp": 0.99990934, + "epoch": 0.6330788192148139, + "flos": 70936897847040.0, + "grad_norm": 0.7710129583339121, + "language_loss": 0.54505467, + "learning_rate": 1.2535643017755776e-06, + "loss": 0.56700283, + "num_input_tokens_seen": 113383730, + "step": 5265, + "time_per_iteration": 3.312512159347534 + }, + { + "auxiliary_loss_clip": 0.01092847, + "auxiliary_loss_mlp": 0.01084877, + "balance_loss_clip": 1.02474928, + "balance_loss_mlp": 1.00452971, + "epoch": 0.6331990621054531, + "flos": 21244501215360.0, + "grad_norm": 2.6177782120268716, + "language_loss": 0.71666247, + "learning_rate": 1.2528416723454737e-06, + "loss": 0.73843968, + "num_input_tokens_seen": 113400400, + "step": 5266, + "time_per_iteration": 2.733846426010132 + }, + { + "auxiliary_loss_clip": 0.01135488, + "auxiliary_loss_mlp": 0.010844, + "balance_loss_clip": 1.02596951, + "balance_loss_mlp": 1.00424409, + "epoch": 0.6333193049960921, + "flos": 34459412526720.0, + "grad_norm": 1.4436180138215864, + "language_loss": 0.71052003, + "learning_rate": 1.2521191562586945e-06, + "loss": 0.73271888, + "num_input_tokens_seen": 113424050, + "step": 5267, + "time_per_iteration": 2.7672250270843506 + }, + { + "auxiliary_loss_clip": 0.01136613, + "auxiliary_loss_mlp": 0.00872927, + "balance_loss_clip": 1.02657473, + "balance_loss_mlp": 1.00012326, + "epoch": 0.6334395478867312, + "flos": 18329883932160.0, + "grad_norm": 2.044037629780542, + "language_loss": 0.7687062, + "learning_rate": 1.2513967536248445e-06, + "loss": 0.78880155, + "num_input_tokens_seen": 113440370, + "step": 5268, + "time_per_iteration": 2.612687587738037 + }, + { + "auxiliary_loss_clip": 0.01119726, + "auxiliary_loss_mlp": 0.01084417, + "balance_loss_clip": 1.02511358, + "balance_loss_mlp": 1.00411797, + "epoch": 0.6335597907773702, + "flos": 23623152687360.0, + "grad_norm": 2.0130826097946635, + "language_loss": 0.80801481, + "learning_rate": 1.2506744645535117e-06, + "loss": 0.83005619, + "num_input_tokens_seen": 113460800, + "step": 5269, + "time_per_iteration": 2.6546742916107178 + }, + { + "auxiliary_loss_clip": 0.01118223, + "auxiliary_loss_mlp": 0.01084128, + "balance_loss_clip": 1.02405405, + "balance_loss_mlp": 1.00382817, + "epoch": 0.6336800336680094, + "flos": 22710913954560.0, + "grad_norm": 1.8180015408344008, + "language_loss": 0.60363483, + "learning_rate": 1.249952289154267e-06, + "loss": 0.62565833, + "num_input_tokens_seen": 113480840, + "step": 5270, + "time_per_iteration": 2.7332141399383545 + }, + { + "auxiliary_loss_clip": 0.01090263, + "auxiliary_loss_mlp": 0.01085341, + "balance_loss_clip": 1.02317071, + "balance_loss_mlp": 1.00504196, + "epoch": 0.6338002765586485, + "flos": 23622757637760.0, + "grad_norm": 1.5955482120181648, + "language_loss": 0.76352769, + "learning_rate": 1.2492302275366635e-06, + "loss": 0.7852838, + "num_input_tokens_seen": 113500515, + "step": 5271, + "time_per_iteration": 2.8409969806671143 + }, + { + "auxiliary_loss_clip": 0.01127981, + "auxiliary_loss_mlp": 0.01084779, + "balance_loss_clip": 1.0256182, + "balance_loss_mlp": 1.00433612, + "epoch": 0.6339205194492875, + "flos": 26505450708480.0, + "grad_norm": 1.8711656291457262, + "language_loss": 0.65290391, + "learning_rate": 1.2485082798102377e-06, + "loss": 0.67503154, + "num_input_tokens_seen": 113520930, + "step": 5272, + "time_per_iteration": 2.709635019302368 + }, + { + "auxiliary_loss_clip": 0.01110396, + "auxiliary_loss_mlp": 0.01085881, + "balance_loss_clip": 1.0246172, + "balance_loss_mlp": 1.00553393, + "epoch": 0.6340407623399267, + "flos": 18544306170240.0, + "grad_norm": 2.1012948294983853, + "language_loss": 0.68120921, + "learning_rate": 1.2477864460845084e-06, + "loss": 0.70317197, + "num_input_tokens_seen": 113537330, + "step": 5273, + "time_per_iteration": 2.7286922931671143 + }, + { + "auxiliary_loss_clip": 0.01119429, + "auxiliary_loss_mlp": 0.0108453, + "balance_loss_clip": 1.02582037, + "balance_loss_mlp": 1.00403953, + "epoch": 0.6341610052305657, + "flos": 17712579772800.0, + "grad_norm": 2.2102862760160127, + "language_loss": 0.73500818, + "learning_rate": 1.2470647264689776e-06, + "loss": 0.75704777, + "num_input_tokens_seen": 113555810, + "step": 5274, + "time_per_iteration": 2.7038214206695557 + }, + { + "auxiliary_loss_clip": 0.01088863, + "auxiliary_loss_mlp": 0.01084923, + "balance_loss_clip": 1.02164567, + "balance_loss_mlp": 1.00471926, + "epoch": 0.6342812481212048, + "flos": 23587026583680.0, + "grad_norm": 1.9887813134083117, + "language_loss": 0.71406853, + "learning_rate": 1.2463431210731282e-06, + "loss": 0.73580647, + "num_input_tokens_seen": 113575395, + "step": 5275, + "time_per_iteration": 2.9018282890319824 + }, + { + "auxiliary_loss_clip": 0.0108479, + "auxiliary_loss_mlp": 0.0108524, + "balance_loss_clip": 1.02350593, + "balance_loss_mlp": 1.00494027, + "epoch": 0.634401491011844, + "flos": 17821927751040.0, + "grad_norm": 2.455087326782991, + "language_loss": 0.76516545, + "learning_rate": 1.2456216300064289e-06, + "loss": 0.78686571, + "num_input_tokens_seen": 113592945, + "step": 5276, + "time_per_iteration": 3.6388206481933594 + }, + { + "auxiliary_loss_clip": 0.01121652, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_clip": 1.02686739, + "balance_loss_mlp": 1.00467682, + "epoch": 0.634521733902483, + "flos": 21358158825600.0, + "grad_norm": 1.6890457875821911, + "language_loss": 0.78135121, + "learning_rate": 1.244900253378328e-06, + "loss": 0.8034175, + "num_input_tokens_seen": 113613000, + "step": 5277, + "time_per_iteration": 2.807295799255371 + }, + { + "auxiliary_loss_clip": 0.01051662, + "auxiliary_loss_mlp": 0.01084198, + "balance_loss_clip": 1.0200268, + "balance_loss_mlp": 1.00404131, + "epoch": 0.6346419767931221, + "flos": 16545052103040.0, + "grad_norm": 2.0684203847393277, + "language_loss": 0.69201183, + "learning_rate": 1.2441789912982583e-06, + "loss": 0.71337044, + "num_input_tokens_seen": 113630085, + "step": 5278, + "time_per_iteration": 2.897164821624756 + }, + { + "auxiliary_loss_clip": 0.01127803, + "auxiliary_loss_mlp": 0.01085208, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.00490892, + "epoch": 0.6347622196837612, + "flos": 24350989973760.0, + "grad_norm": 1.7038355132689575, + "language_loss": 0.64971411, + "learning_rate": 1.2434578438756346e-06, + "loss": 0.67184418, + "num_input_tokens_seen": 113650515, + "step": 5279, + "time_per_iteration": 2.7737550735473633 + }, + { + "auxiliary_loss_clip": 0.01125882, + "auxiliary_loss_mlp": 0.01084162, + "balance_loss_clip": 1.02407789, + "balance_loss_mlp": 1.00390983, + "epoch": 0.6348824625744003, + "flos": 64523178195840.0, + "grad_norm": 2.776681666666266, + "language_loss": 0.77761257, + "learning_rate": 1.242736811219855e-06, + "loss": 0.79971302, + "num_input_tokens_seen": 113676475, + "step": 5280, + "time_per_iteration": 3.8814306259155273 + }, + { + "auxiliary_loss_clip": 0.01127963, + "auxiliary_loss_mlp": 0.01084231, + "balance_loss_clip": 1.02596474, + "balance_loss_mlp": 1.00402737, + "epoch": 0.6350027054650393, + "flos": 28622133313920.0, + "grad_norm": 1.7194025577890222, + "language_loss": 0.82246709, + "learning_rate": 1.2420158934402988e-06, + "loss": 0.84458905, + "num_input_tokens_seen": 113697090, + "step": 5281, + "time_per_iteration": 3.6226329803466797 + }, + { + "auxiliary_loss_clip": 0.01107433, + "auxiliary_loss_mlp": 0.01083936, + "balance_loss_clip": 1.0228734, + "balance_loss_mlp": 1.00363636, + "epoch": 0.6351229483556785, + "flos": 23002544476800.0, + "grad_norm": 2.059971455706597, + "language_loss": 0.8470884, + "learning_rate": 1.2412950906463286e-06, + "loss": 0.8690021, + "num_input_tokens_seen": 113714395, + "step": 5282, + "time_per_iteration": 3.6904289722442627 + }, + { + "auxiliary_loss_clip": 0.01096279, + "auxiliary_loss_mlp": 0.01084578, + "balance_loss_clip": 1.02245629, + "balance_loss_mlp": 1.00442135, + "epoch": 0.6352431912463176, + "flos": 21939300967680.0, + "grad_norm": 1.8643514612345171, + "language_loss": 0.89966279, + "learning_rate": 1.2405744029472902e-06, + "loss": 0.92147136, + "num_input_tokens_seen": 113733880, + "step": 5283, + "time_per_iteration": 2.8365650177001953 + }, + { + "auxiliary_loss_clip": 0.01116699, + "auxiliary_loss_mlp": 0.01083627, + "balance_loss_clip": 1.02473772, + "balance_loss_mlp": 1.00342298, + "epoch": 0.6353634341369566, + "flos": 13735257684480.0, + "grad_norm": 1.8175410086074595, + "language_loss": 0.76325887, + "learning_rate": 1.2398538304525108e-06, + "loss": 0.78526211, + "num_input_tokens_seen": 113752505, + "step": 5284, + "time_per_iteration": 2.686952590942383 + }, + { + "auxiliary_loss_clip": 0.01103184, + "auxiliary_loss_mlp": 0.01083892, + "balance_loss_clip": 1.02055717, + "balance_loss_mlp": 1.00359225, + "epoch": 0.6354836770275958, + "flos": 19316170552320.0, + "grad_norm": 2.222680746958146, + "language_loss": 0.75355285, + "learning_rate": 1.2391333732713016e-06, + "loss": 0.77542359, + "num_input_tokens_seen": 113770310, + "step": 5285, + "time_per_iteration": 2.925208806991577 + }, + { + "auxiliary_loss_clip": 0.01105097, + "auxiliary_loss_mlp": 0.01084703, + "balance_loss_clip": 1.02143264, + "balance_loss_mlp": 1.00440359, + "epoch": 0.6356039199182348, + "flos": 21613375935360.0, + "grad_norm": 2.678807928364009, + "language_loss": 0.7840637, + "learning_rate": 1.2384130315129543e-06, + "loss": 0.80596167, + "num_input_tokens_seen": 113788635, + "step": 5286, + "time_per_iteration": 2.9936368465423584 + }, + { + "auxiliary_loss_clip": 0.0105366, + "auxiliary_loss_mlp": 0.01084815, + "balance_loss_clip": 1.01926589, + "balance_loss_mlp": 1.00456357, + "epoch": 0.6357241628088739, + "flos": 18111978074880.0, + "grad_norm": 1.979359009584331, + "language_loss": 0.73266089, + "learning_rate": 1.2376928052867447e-06, + "loss": 0.75404561, + "num_input_tokens_seen": 113807755, + "step": 5287, + "time_per_iteration": 2.979797124862671 + }, + { + "auxiliary_loss_clip": 0.01116776, + "auxiliary_loss_mlp": 0.0108435, + "balance_loss_clip": 1.02440798, + "balance_loss_mlp": 1.00405073, + "epoch": 0.6358444056995131, + "flos": 24935256599040.0, + "grad_norm": 2.2467073888560956, + "language_loss": 0.77649057, + "learning_rate": 1.2369726947019299e-06, + "loss": 0.79850185, + "num_input_tokens_seen": 113828230, + "step": 5288, + "time_per_iteration": 2.9416260719299316 + }, + { + "auxiliary_loss_clip": 0.01125344, + "auxiliary_loss_mlp": 0.01084558, + "balance_loss_clip": 1.02410746, + "balance_loss_mlp": 1.00440145, + "epoch": 0.6359646485901521, + "flos": 23293348986240.0, + "grad_norm": 2.0677568762271887, + "language_loss": 0.67095482, + "learning_rate": 1.2362526998677511e-06, + "loss": 0.69305384, + "num_input_tokens_seen": 113844595, + "step": 5289, + "time_per_iteration": 2.7094931602478027 + }, + { + "auxiliary_loss_clip": 0.01116549, + "auxiliary_loss_mlp": 0.01084368, + "balance_loss_clip": 1.02296627, + "balance_loss_mlp": 1.00416386, + "epoch": 0.6360848914807912, + "flos": 20887442069760.0, + "grad_norm": 1.7593114052887837, + "language_loss": 0.84230489, + "learning_rate": 1.2355328208934301e-06, + "loss": 0.86431408, + "num_input_tokens_seen": 113863470, + "step": 5290, + "time_per_iteration": 2.760063648223877 + }, + { + "auxiliary_loss_clip": 0.01127297, + "auxiliary_loss_mlp": 0.00872885, + "balance_loss_clip": 1.02541637, + "balance_loss_mlp": 1.00008464, + "epoch": 0.6362051343714303, + "flos": 18479775386880.0, + "grad_norm": 1.5816625673680007, + "language_loss": 0.72117454, + "learning_rate": 1.2348130578881728e-06, + "loss": 0.74117637, + "num_input_tokens_seen": 113881690, + "step": 5291, + "time_per_iteration": 2.6332507133483887 + }, + { + "auxiliary_loss_clip": 0.01136517, + "auxiliary_loss_mlp": 0.01084553, + "balance_loss_clip": 1.02632976, + "balance_loss_mlp": 1.00425351, + "epoch": 0.6363253772620694, + "flos": 24389594115840.0, + "grad_norm": 2.108509373505924, + "language_loss": 0.76275438, + "learning_rate": 1.2340934109611664e-06, + "loss": 0.7849651, + "num_input_tokens_seen": 113902450, + "step": 5292, + "time_per_iteration": 2.6336092948913574 + }, + { + "auxiliary_loss_clip": 0.01112946, + "auxiliary_loss_mlp": 0.01085867, + "balance_loss_clip": 1.02172112, + "balance_loss_mlp": 1.0054723, + "epoch": 0.6364456201527084, + "flos": 25958243940480.0, + "grad_norm": 2.528552730022968, + "language_loss": 0.68250453, + "learning_rate": 1.2333738802215798e-06, + "loss": 0.70449269, + "num_input_tokens_seen": 113922670, + "step": 5293, + "time_per_iteration": 2.708599090576172 + }, + { + "auxiliary_loss_clip": 0.01100675, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_clip": 1.02437782, + "balance_loss_mlp": 1.00427938, + "epoch": 0.6365658630433476, + "flos": 20740711011840.0, + "grad_norm": 2.1428335480541905, + "language_loss": 0.80810279, + "learning_rate": 1.2326544657785668e-06, + "loss": 0.82995439, + "num_input_tokens_seen": 113942360, + "step": 5294, + "time_per_iteration": 2.8337297439575195 + }, + { + "auxiliary_loss_clip": 0.01110815, + "auxiliary_loss_mlp": 0.01085362, + "balance_loss_clip": 1.02511322, + "balance_loss_mlp": 1.00511026, + "epoch": 0.6366861059339867, + "flos": 21434146047360.0, + "grad_norm": 2.3342659611852166, + "language_loss": 0.74347663, + "learning_rate": 1.2319351677412608e-06, + "loss": 0.76543844, + "num_input_tokens_seen": 113959405, + "step": 5295, + "time_per_iteration": 2.7146613597869873 + }, + { + "auxiliary_loss_clip": 0.01108666, + "auxiliary_loss_mlp": 0.01086042, + "balance_loss_clip": 1.02381849, + "balance_loss_mlp": 1.00569522, + "epoch": 0.6368063488246257, + "flos": 22267093507200.0, + "grad_norm": 1.8046345918736064, + "language_loss": 0.7433843, + "learning_rate": 1.2312159862187796e-06, + "loss": 0.76533139, + "num_input_tokens_seen": 113977815, + "step": 5296, + "time_per_iteration": 2.807863473892212 + }, + { + "auxiliary_loss_clip": 0.01137651, + "auxiliary_loss_mlp": 0.01086116, + "balance_loss_clip": 1.02716768, + "balance_loss_mlp": 1.00581694, + "epoch": 0.6369265917152649, + "flos": 22420719976320.0, + "grad_norm": 1.4644848546349478, + "language_loss": 0.76325446, + "learning_rate": 1.2304969213202217e-06, + "loss": 0.78549212, + "num_input_tokens_seen": 113999075, + "step": 5297, + "time_per_iteration": 2.6601524353027344 + }, + { + "auxiliary_loss_clip": 0.01117916, + "auxiliary_loss_mlp": 0.01084269, + "balance_loss_clip": 1.02436185, + "balance_loss_mlp": 1.00406551, + "epoch": 0.6370468346059039, + "flos": 24718176754560.0, + "grad_norm": 3.014115785846901, + "language_loss": 0.79642582, + "learning_rate": 1.2297779731546692e-06, + "loss": 0.81844771, + "num_input_tokens_seen": 114018170, + "step": 5298, + "time_per_iteration": 2.7069764137268066 + }, + { + "auxiliary_loss_clip": 0.0111587, + "auxiliary_loss_mlp": 0.01086229, + "balance_loss_clip": 1.02462196, + "balance_loss_mlp": 1.00602531, + "epoch": 0.637167077496543, + "flos": 25296589463040.0, + "grad_norm": 2.0526067890472466, + "language_loss": 0.77964282, + "learning_rate": 1.2290591418311853e-06, + "loss": 0.80166388, + "num_input_tokens_seen": 114035565, + "step": 5299, + "time_per_iteration": 2.720184087753296 + }, + { + "auxiliary_loss_clip": 0.0112136, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.02198887, + "balance_loss_mlp": 1.0043565, + "epoch": 0.637287320387182, + "flos": 27671110871040.0, + "grad_norm": 1.5707548405397, + "language_loss": 0.71915495, + "learning_rate": 1.2283404274588172e-06, + "loss": 0.74121511, + "num_input_tokens_seen": 114054510, + "step": 5300, + "time_per_iteration": 2.7094879150390625 + }, + { + "auxiliary_loss_clip": 0.01061925, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_clip": 1.01599908, + "balance_loss_mlp": 1.00030029, + "epoch": 0.6374075632778212, + "flos": 63173406873600.0, + "grad_norm": 0.7414555249634327, + "language_loss": 0.52898455, + "learning_rate": 1.227621830146592e-06, + "loss": 0.5503965, + "num_input_tokens_seen": 114109875, + "step": 5301, + "time_per_iteration": 4.244406461715698 + }, + { + "auxiliary_loss_clip": 0.01103601, + "auxiliary_loss_mlp": 0.01084695, + "balance_loss_clip": 1.02096164, + "balance_loss_mlp": 1.00449133, + "epoch": 0.6375278061684603, + "flos": 25558127366400.0, + "grad_norm": 1.9073214875303648, + "language_loss": 0.79110301, + "learning_rate": 1.2269033500035217e-06, + "loss": 0.81298602, + "num_input_tokens_seen": 114130010, + "step": 5302, + "time_per_iteration": 2.8069214820861816 + }, + { + "auxiliary_loss_clip": 0.01085729, + "auxiliary_loss_mlp": 0.01084523, + "balance_loss_clip": 1.02420282, + "balance_loss_mlp": 1.00427163, + "epoch": 0.6376480490590993, + "flos": 25666362023040.0, + "grad_norm": 1.8624840744198388, + "language_loss": 0.73468351, + "learning_rate": 1.2261849871385988e-06, + "loss": 0.75638604, + "num_input_tokens_seen": 114151115, + "step": 5303, + "time_per_iteration": 2.7689340114593506 + }, + { + "auxiliary_loss_clip": 0.01136314, + "auxiliary_loss_mlp": 0.01084422, + "balance_loss_clip": 1.02609599, + "balance_loss_mlp": 1.00417006, + "epoch": 0.6377682919497385, + "flos": 31537684350720.0, + "grad_norm": 2.35477425828301, + "language_loss": 0.62701362, + "learning_rate": 1.2254667416607972e-06, + "loss": 0.64922094, + "num_input_tokens_seen": 114172715, + "step": 5304, + "time_per_iteration": 2.8398241996765137 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01084172, + "balance_loss_clip": 1.02505529, + "balance_loss_mlp": 1.003968, + "epoch": 0.6378885348403776, + "flos": 23039209284480.0, + "grad_norm": 1.821882878004379, + "language_loss": 0.82588738, + "learning_rate": 1.2247486136790756e-06, + "loss": 0.84798563, + "num_input_tokens_seen": 114192195, + "step": 5305, + "time_per_iteration": 4.43564772605896 + }, + { + "auxiliary_loss_clip": 0.01126333, + "auxiliary_loss_mlp": 0.01086059, + "balance_loss_clip": 1.02475047, + "balance_loss_mlp": 1.00585485, + "epoch": 0.6380087777310166, + "flos": 18697070712960.0, + "grad_norm": 1.9689926512735858, + "language_loss": 0.80715275, + "learning_rate": 1.2240306033023726e-06, + "loss": 0.82927668, + "num_input_tokens_seen": 114210020, + "step": 5306, + "time_per_iteration": 2.6853115558624268 + }, + { + "auxiliary_loss_clip": 0.01110265, + "auxiliary_loss_mlp": 0.01084189, + "balance_loss_clip": 1.02409089, + "balance_loss_mlp": 1.00393736, + "epoch": 0.6381290206216558, + "flos": 23331558078720.0, + "grad_norm": 1.8719699849910294, + "language_loss": 0.72041917, + "learning_rate": 1.223312710639611e-06, + "loss": 0.74236369, + "num_input_tokens_seen": 114228740, + "step": 5307, + "time_per_iteration": 3.7280712127685547 + }, + { + "auxiliary_loss_clip": 0.01119286, + "auxiliary_loss_mlp": 0.01084766, + "balance_loss_clip": 1.02641058, + "balance_loss_mlp": 1.00451469, + "epoch": 0.6382492635122948, + "flos": 18880466578560.0, + "grad_norm": 1.8772862655757394, + "language_loss": 0.86706018, + "learning_rate": 1.2225949357996928e-06, + "loss": 0.88910067, + "num_input_tokens_seen": 114246865, + "step": 5308, + "time_per_iteration": 2.691462993621826 + }, + { + "auxiliary_loss_clip": 0.01125756, + "auxiliary_loss_mlp": 0.01083225, + "balance_loss_clip": 1.02477241, + "balance_loss_mlp": 1.00302076, + "epoch": 0.6383695064029339, + "flos": 27819134818560.0, + "grad_norm": 1.8303061537524907, + "language_loss": 0.80321252, + "learning_rate": 1.221877278891505e-06, + "loss": 0.82530236, + "num_input_tokens_seen": 114266120, + "step": 5309, + "time_per_iteration": 2.728282928466797 + }, + { + "auxiliary_loss_clip": 0.01112092, + "auxiliary_loss_mlp": 0.01084275, + "balance_loss_clip": 1.02694273, + "balance_loss_mlp": 1.00392771, + "epoch": 0.638489749293573, + "flos": 26395635853440.0, + "grad_norm": 2.9238396989177895, + "language_loss": 0.7127831, + "learning_rate": 1.221159740023915e-06, + "loss": 0.73474681, + "num_input_tokens_seen": 114285950, + "step": 5310, + "time_per_iteration": 2.6578078269958496 + }, + { + "auxiliary_loss_clip": 0.01108905, + "auxiliary_loss_mlp": 0.00872946, + "balance_loss_clip": 1.0243535, + "balance_loss_mlp": 1.00007224, + "epoch": 0.6386099921842121, + "flos": 23988328306560.0, + "grad_norm": 1.737923681545422, + "language_loss": 0.72145993, + "learning_rate": 1.2204423193057735e-06, + "loss": 0.74127853, + "num_input_tokens_seen": 114304780, + "step": 5311, + "time_per_iteration": 2.8418867588043213 + }, + { + "auxiliary_loss_clip": 0.01098814, + "auxiliary_loss_mlp": 0.01078986, + "balance_loss_clip": 1.01940536, + "balance_loss_mlp": 1.00002146, + "epoch": 0.6387302350748512, + "flos": 71731169337600.0, + "grad_norm": 0.8560638400420668, + "language_loss": 0.63366008, + "learning_rate": 1.2197250168459122e-06, + "loss": 0.65543807, + "num_input_tokens_seen": 114361180, + "step": 5312, + "time_per_iteration": 3.3379006385803223 + }, + { + "auxiliary_loss_clip": 0.01128018, + "auxiliary_loss_mlp": 0.01084075, + "balance_loss_clip": 1.02609897, + "balance_loss_mlp": 1.00387132, + "epoch": 0.6388504779654903, + "flos": 14535778141440.0, + "grad_norm": 1.739757092808246, + "language_loss": 0.74068534, + "learning_rate": 1.2190078327531454e-06, + "loss": 0.7628063, + "num_input_tokens_seen": 114377425, + "step": 5313, + "time_per_iteration": 2.6381869316101074 + }, + { + "auxiliary_loss_clip": 0.01127398, + "auxiliary_loss_mlp": 0.01085026, + "balance_loss_clip": 1.02568007, + "balance_loss_mlp": 1.00482225, + "epoch": 0.6389707208561294, + "flos": 22346133384960.0, + "grad_norm": 1.446358349017398, + "language_loss": 0.7277146, + "learning_rate": 1.2182907671362697e-06, + "loss": 0.74983883, + "num_input_tokens_seen": 114398120, + "step": 5314, + "time_per_iteration": 2.7216367721557617 + }, + { + "auxiliary_loss_clip": 0.01125661, + "auxiliary_loss_mlp": 0.01085257, + "balance_loss_clip": 1.02572393, + "balance_loss_mlp": 1.00500548, + "epoch": 0.6390909637467684, + "flos": 19426883247360.0, + "grad_norm": 1.8179193468558266, + "language_loss": 0.78670949, + "learning_rate": 1.2175738201040626e-06, + "loss": 0.80881864, + "num_input_tokens_seen": 114415160, + "step": 5315, + "time_per_iteration": 2.595409393310547 + }, + { + "auxiliary_loss_clip": 0.01124092, + "auxiliary_loss_mlp": 0.01083802, + "balance_loss_clip": 1.02383065, + "balance_loss_mlp": 1.00359797, + "epoch": 0.6392112066374076, + "flos": 24090852700800.0, + "grad_norm": 2.197261432781726, + "language_loss": 0.78179073, + "learning_rate": 1.2168569917652855e-06, + "loss": 0.80386972, + "num_input_tokens_seen": 114435015, + "step": 5316, + "time_per_iteration": 2.8366360664367676 + }, + { + "auxiliary_loss_clip": 0.0112628, + "auxiliary_loss_mlp": 0.01084659, + "balance_loss_clip": 1.02506113, + "balance_loss_mlp": 1.00440741, + "epoch": 0.6393314495280467, + "flos": 26795141896320.0, + "grad_norm": 1.9533344502579988, + "language_loss": 0.63611281, + "learning_rate": 1.2161402822286797e-06, + "loss": 0.6582222, + "num_input_tokens_seen": 114455700, + "step": 5317, + "time_per_iteration": 2.7368648052215576 + }, + { + "auxiliary_loss_clip": 0.01103, + "auxiliary_loss_mlp": 0.01083293, + "balance_loss_clip": 1.02445495, + "balance_loss_mlp": 1.00308943, + "epoch": 0.6394516924186857, + "flos": 20260692633600.0, + "grad_norm": 2.2172421152667092, + "language_loss": 0.78734142, + "learning_rate": 1.2154236916029703e-06, + "loss": 0.80920434, + "num_input_tokens_seen": 114473675, + "step": 5318, + "time_per_iteration": 2.8439836502075195 + }, + { + "auxiliary_loss_clip": 0.01098436, + "auxiliary_loss_mlp": 0.01084173, + "balance_loss_clip": 1.02197099, + "balance_loss_mlp": 1.00396848, + "epoch": 0.6395719353093249, + "flos": 18368847210240.0, + "grad_norm": 2.374539643326892, + "language_loss": 0.73575306, + "learning_rate": 1.2147072199968627e-06, + "loss": 0.75757909, + "num_input_tokens_seen": 114492310, + "step": 5319, + "time_per_iteration": 2.7936594486236572 + }, + { + "auxiliary_loss_clip": 0.011246, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_clip": 1.02408469, + "balance_loss_mlp": 1.00412464, + "epoch": 0.6396921781999639, + "flos": 17566315591680.0, + "grad_norm": 1.7961002396060097, + "language_loss": 0.71817076, + "learning_rate": 1.2139908675190454e-06, + "loss": 0.74025905, + "num_input_tokens_seen": 114511520, + "step": 5320, + "time_per_iteration": 2.670728921890259 + }, + { + "auxiliary_loss_clip": 0.0108892, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_clip": 1.02286172, + "balance_loss_mlp": 1.00347471, + "epoch": 0.639812421090603, + "flos": 21251252972160.0, + "grad_norm": 1.8282139245131344, + "language_loss": 0.75048888, + "learning_rate": 1.2132746342781883e-06, + "loss": 0.77221394, + "num_input_tokens_seen": 114532680, + "step": 5321, + "time_per_iteration": 2.8514628410339355 + }, + { + "auxiliary_loss_clip": 0.01134983, + "auxiliary_loss_mlp": 0.01085003, + "balance_loss_clip": 1.02495527, + "balance_loss_mlp": 1.00470376, + "epoch": 0.6399326639812422, + "flos": 11180967684480.0, + "grad_norm": 2.3229275443634148, + "language_loss": 0.79968333, + "learning_rate": 1.2125585203829442e-06, + "loss": 0.82188326, + "num_input_tokens_seen": 114548320, + "step": 5322, + "time_per_iteration": 2.616487503051758 + }, + { + "auxiliary_loss_clip": 0.01102199, + "auxiliary_loss_mlp": 0.01086329, + "balance_loss_clip": 1.02482677, + "balance_loss_mlp": 1.00607693, + "epoch": 0.6400529068718812, + "flos": 23911048195200.0, + "grad_norm": 1.6900573068621219, + "language_loss": 0.74141324, + "learning_rate": 1.211842525941946e-06, + "loss": 0.76329851, + "num_input_tokens_seen": 114568115, + "step": 5323, + "time_per_iteration": 2.7306642532348633 + }, + { + "auxiliary_loss_clip": 0.01094964, + "auxiliary_loss_mlp": 0.0108427, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.00401807, + "epoch": 0.6401731497625203, + "flos": 44018724890880.0, + "grad_norm": 2.2559900013738683, + "language_loss": 0.78839868, + "learning_rate": 1.2111266510638105e-06, + "loss": 0.81019104, + "num_input_tokens_seen": 114591040, + "step": 5324, + "time_per_iteration": 2.915853261947632 + }, + { + "auxiliary_loss_clip": 0.01084243, + "auxiliary_loss_mlp": 0.01083772, + "balance_loss_clip": 1.02306509, + "balance_loss_mlp": 1.00352049, + "epoch": 0.6402933926531594, + "flos": 20662209838080.0, + "grad_norm": 1.8914352756028527, + "language_loss": 0.79990011, + "learning_rate": 1.2104108958571346e-06, + "loss": 0.82158023, + "num_input_tokens_seen": 114609310, + "step": 5325, + "time_per_iteration": 2.8204843997955322 + }, + { + "auxiliary_loss_clip": 0.0112482, + "auxiliary_loss_mlp": 0.01083908, + "balance_loss_clip": 1.02510273, + "balance_loss_mlp": 1.00375128, + "epoch": 0.6404136355437985, + "flos": 24863327614080.0, + "grad_norm": 1.4701777450512263, + "language_loss": 0.75775385, + "learning_rate": 1.2096952604304975e-06, + "loss": 0.77984118, + "num_input_tokens_seen": 114629740, + "step": 5326, + "time_per_iteration": 3.5547008514404297 + }, + { + "auxiliary_loss_clip": 0.0112522, + "auxiliary_loss_mlp": 0.01084235, + "balance_loss_clip": 1.0243423, + "balance_loss_mlp": 1.00393605, + "epoch": 0.6405338784344375, + "flos": 40479548901120.0, + "grad_norm": 2.0841434457624546, + "language_loss": 0.70232761, + "learning_rate": 1.2089797448924616e-06, + "loss": 0.72442216, + "num_input_tokens_seen": 114653615, + "step": 5327, + "time_per_iteration": 2.8046605587005615 + }, + { + "auxiliary_loss_clip": 0.01086252, + "auxiliary_loss_mlp": 0.01084818, + "balance_loss_clip": 1.02467537, + "balance_loss_mlp": 1.00447118, + "epoch": 0.6406541213250767, + "flos": 20886041439360.0, + "grad_norm": 2.0901985301692, + "language_loss": 0.65677738, + "learning_rate": 1.2082643493515692e-06, + "loss": 0.67848814, + "num_input_tokens_seen": 114671935, + "step": 5328, + "time_per_iteration": 2.738687515258789 + }, + { + "auxiliary_loss_clip": 0.01125474, + "auxiliary_loss_mlp": 0.01083914, + "balance_loss_clip": 1.02488697, + "balance_loss_mlp": 1.00370991, + "epoch": 0.6407743642157158, + "flos": 23295970679040.0, + "grad_norm": 1.7092960633951921, + "language_loss": 0.82008314, + "learning_rate": 1.207549073916346e-06, + "loss": 0.84217697, + "num_input_tokens_seen": 114692870, + "step": 5329, + "time_per_iteration": 2.683988332748413 + }, + { + "auxiliary_loss_clip": 0.01118659, + "auxiliary_loss_mlp": 0.01084806, + "balance_loss_clip": 1.02644682, + "balance_loss_mlp": 1.00460172, + "epoch": 0.6408946071063548, + "flos": 15012636122880.0, + "grad_norm": 1.8210271600979095, + "language_loss": 0.77726638, + "learning_rate": 1.2068339186952976e-06, + "loss": 0.79930103, + "num_input_tokens_seen": 114710410, + "step": 5330, + "time_per_iteration": 3.700350046157837 + }, + { + "auxiliary_loss_clip": 0.01127396, + "auxiliary_loss_mlp": 0.01083895, + "balance_loss_clip": 1.02547419, + "balance_loss_mlp": 1.00364327, + "epoch": 0.6410148499969939, + "flos": 22528595496960.0, + "grad_norm": 1.8674233266795095, + "language_loss": 0.73342633, + "learning_rate": 1.2061188837969136e-06, + "loss": 0.75553924, + "num_input_tokens_seen": 114730020, + "step": 5331, + "time_per_iteration": 3.6269564628601074 + }, + { + "auxiliary_loss_clip": 0.01107558, + "auxiliary_loss_mlp": 0.01085351, + "balance_loss_clip": 1.02290821, + "balance_loss_mlp": 1.00495648, + "epoch": 0.641135092887633, + "flos": 12422004537600.0, + "grad_norm": 2.34746417490353, + "language_loss": 0.83630967, + "learning_rate": 1.2054039693296631e-06, + "loss": 0.85823882, + "num_input_tokens_seen": 114748015, + "step": 5332, + "time_per_iteration": 2.7502057552337646 + }, + { + "auxiliary_loss_clip": 0.011036, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_clip": 1.02093935, + "balance_loss_mlp": 1.00431001, + "epoch": 0.6412553357782721, + "flos": 22127329687680.0, + "grad_norm": 1.6407798402036406, + "language_loss": 0.81354642, + "learning_rate": 1.2046891754019992e-06, + "loss": 0.83542705, + "num_input_tokens_seen": 114768625, + "step": 5333, + "time_per_iteration": 3.696946620941162 + }, + { + "auxiliary_loss_clip": 0.01127744, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_clip": 1.02609956, + "balance_loss_mlp": 1.00416827, + "epoch": 0.6413755786689112, + "flos": 15888605097600.0, + "grad_norm": 1.732678272597338, + "language_loss": 0.8266474, + "learning_rate": 1.2039745021223548e-06, + "loss": 0.84876955, + "num_input_tokens_seen": 114786045, + "step": 5334, + "time_per_iteration": 2.603799819946289 + }, + { + "auxiliary_loss_clip": 0.01080715, + "auxiliary_loss_mlp": 0.01079232, + "balance_loss_clip": 1.02691019, + "balance_loss_mlp": 1.00026822, + "epoch": 0.6414958215595503, + "flos": 68039159955840.0, + "grad_norm": 0.8181183700174723, + "language_loss": 0.5710932, + "learning_rate": 1.2032599495991456e-06, + "loss": 0.59269267, + "num_input_tokens_seen": 114850785, + "step": 5335, + "time_per_iteration": 3.4268734455108643 + }, + { + "auxiliary_loss_clip": 0.01125761, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_clip": 1.02552676, + "balance_loss_mlp": 1.00466728, + "epoch": 0.6416160644501894, + "flos": 44091300320640.0, + "grad_norm": 1.6632730078912736, + "language_loss": 0.69539034, + "learning_rate": 1.2025455179407685e-06, + "loss": 0.71749765, + "num_input_tokens_seen": 114871945, + "step": 5336, + "time_per_iteration": 2.8651702404022217 + }, + { + "auxiliary_loss_clip": 0.01124901, + "auxiliary_loss_mlp": 0.008729, + "balance_loss_clip": 1.02401567, + "balance_loss_mlp": 1.00007391, + "epoch": 0.6417363073408284, + "flos": 20959837931520.0, + "grad_norm": 1.8437908265351806, + "language_loss": 0.73768133, + "learning_rate": 1.2018312072556022e-06, + "loss": 0.75765932, + "num_input_tokens_seen": 114890445, + "step": 5337, + "time_per_iteration": 2.6175661087036133 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.00872874, + "balance_loss_clip": 1.02606082, + "balance_loss_mlp": 1.00007796, + "epoch": 0.6418565502314676, + "flos": 22455122227200.0, + "grad_norm": 1.7788429591263302, + "language_loss": 0.74293363, + "learning_rate": 1.2011170176520077e-06, + "loss": 0.7630198, + "num_input_tokens_seen": 114911360, + "step": 5338, + "time_per_iteration": 2.633385419845581 + }, + { + "auxiliary_loss_clip": 0.01087419, + "auxiliary_loss_mlp": 0.01084745, + "balance_loss_clip": 1.02185333, + "balance_loss_mlp": 1.00449324, + "epoch": 0.6419767931221066, + "flos": 25045502417280.0, + "grad_norm": 1.5419173095963954, + "language_loss": 0.81254327, + "learning_rate": 1.2004029492383256e-06, + "loss": 0.83426493, + "num_input_tokens_seen": 114932700, + "step": 5339, + "time_per_iteration": 2.8420114517211914 + }, + { + "auxiliary_loss_clip": 0.01124888, + "auxiliary_loss_mlp": 0.01086351, + "balance_loss_clip": 1.02455413, + "balance_loss_mlp": 1.00609887, + "epoch": 0.6420970360127457, + "flos": 19463691709440.0, + "grad_norm": 1.735765175120467, + "language_loss": 0.73597902, + "learning_rate": 1.1996890021228814e-06, + "loss": 0.75809145, + "num_input_tokens_seen": 114949475, + "step": 5340, + "time_per_iteration": 2.718479633331299 + }, + { + "auxiliary_loss_clip": 0.01118465, + "auxiliary_loss_mlp": 0.01084493, + "balance_loss_clip": 1.02467763, + "balance_loss_mlp": 1.00433707, + "epoch": 0.6422172789033849, + "flos": 40406147458560.0, + "grad_norm": 1.6271190420123514, + "language_loss": 0.70112967, + "learning_rate": 1.1989751764139785e-06, + "loss": 0.72315925, + "num_input_tokens_seen": 114973125, + "step": 5341, + "time_per_iteration": 2.8357763290405273 + }, + { + "auxiliary_loss_clip": 0.0109999, + "auxiliary_loss_mlp": 0.01085256, + "balance_loss_clip": 1.02296805, + "balance_loss_mlp": 1.00505185, + "epoch": 0.6423375217940239, + "flos": 27672870637440.0, + "grad_norm": 1.648509472138223, + "language_loss": 0.82924509, + "learning_rate": 1.1982614722199044e-06, + "loss": 0.85109758, + "num_input_tokens_seen": 114994300, + "step": 5342, + "time_per_iteration": 2.871861219406128 + }, + { + "auxiliary_loss_clip": 0.01118325, + "auxiliary_loss_mlp": 0.01084317, + "balance_loss_clip": 1.0244627, + "balance_loss_mlp": 1.00416064, + "epoch": 0.642457764684663, + "flos": 18369242259840.0, + "grad_norm": 1.963151253545524, + "language_loss": 0.77766836, + "learning_rate": 1.1975478896489276e-06, + "loss": 0.79969484, + "num_input_tokens_seen": 115012135, + "step": 5343, + "time_per_iteration": 2.6896722316741943 + }, + { + "auxiliary_loss_clip": 0.01133941, + "auxiliary_loss_mlp": 0.01084179, + "balance_loss_clip": 1.02463627, + "balance_loss_mlp": 1.00397515, + "epoch": 0.6425780075753021, + "flos": 19750509809280.0, + "grad_norm": 1.8545495342634641, + "language_loss": 0.76399595, + "learning_rate": 1.1968344288092981e-06, + "loss": 0.78617716, + "num_input_tokens_seen": 115028715, + "step": 5344, + "time_per_iteration": 2.596052408218384 + }, + { + "auxiliary_loss_clip": 0.01126031, + "auxiliary_loss_mlp": 0.00872811, + "balance_loss_clip": 1.02466369, + "balance_loss_mlp": 1.00010753, + "epoch": 0.6426982504659412, + "flos": 20558536208640.0, + "grad_norm": 1.7175356399838686, + "language_loss": 0.64814103, + "learning_rate": 1.1961210898092468e-06, + "loss": 0.66812944, + "num_input_tokens_seen": 115047665, + "step": 5345, + "time_per_iteration": 2.7226579189300537 + }, + { + "auxiliary_loss_clip": 0.0111792, + "auxiliary_loss_mlp": 0.01084794, + "balance_loss_clip": 1.02440846, + "balance_loss_mlp": 1.00463784, + "epoch": 0.6428184933565803, + "flos": 17851984456320.0, + "grad_norm": 2.0619588815467758, + "language_loss": 0.79150057, + "learning_rate": 1.1954078727569874e-06, + "loss": 0.8135277, + "num_input_tokens_seen": 115064965, + "step": 5346, + "time_per_iteration": 2.729881763458252 + }, + { + "auxiliary_loss_clip": 0.01108383, + "auxiliary_loss_mlp": 0.00872839, + "balance_loss_clip": 1.02426219, + "balance_loss_mlp": 1.00007391, + "epoch": 0.6429387362472194, + "flos": 22456953820800.0, + "grad_norm": 1.5312368615677001, + "language_loss": 0.78189808, + "learning_rate": 1.1946947777607141e-06, + "loss": 0.80171031, + "num_input_tokens_seen": 115086100, + "step": 5347, + "time_per_iteration": 2.7973430156707764 + }, + { + "auxiliary_loss_clip": 0.01098788, + "auxiliary_loss_mlp": 0.01085157, + "balance_loss_clip": 1.0230968, + "balance_loss_mlp": 1.00485742, + "epoch": 0.6430589791378585, + "flos": 24752579005440.0, + "grad_norm": 1.8832380701735556, + "language_loss": 0.80437672, + "learning_rate": 1.1939818049286024e-06, + "loss": 0.8262161, + "num_input_tokens_seen": 115104260, + "step": 5348, + "time_per_iteration": 2.8048019409179688 + }, + { + "auxiliary_loss_clip": 0.01082187, + "auxiliary_loss_mlp": 0.01085637, + "balance_loss_clip": 1.02189302, + "balance_loss_mlp": 1.0053854, + "epoch": 0.6431792220284975, + "flos": 24901249397760.0, + "grad_norm": 1.6310822553132993, + "language_loss": 0.75758445, + "learning_rate": 1.1932689543688101e-06, + "loss": 0.77926266, + "num_input_tokens_seen": 115125365, + "step": 5349, + "time_per_iteration": 2.8680927753448486 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.01085493, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 1.00519371, + "epoch": 0.6432994649191367, + "flos": 21032305620480.0, + "grad_norm": 1.662364480945927, + "language_loss": 0.72652948, + "learning_rate": 1.1925562261894756e-06, + "loss": 0.74854636, + "num_input_tokens_seen": 115144445, + "step": 5350, + "time_per_iteration": 2.6285388469696045 + }, + { + "auxiliary_loss_clip": 0.01115578, + "auxiliary_loss_mlp": 0.01083674, + "balance_loss_clip": 1.02344441, + "balance_loss_mlp": 1.00347018, + "epoch": 0.6434197078097758, + "flos": 30884433655680.0, + "grad_norm": 1.9180448070760319, + "language_loss": 0.7722497, + "learning_rate": 1.1918436204987207e-06, + "loss": 0.79424226, + "num_input_tokens_seen": 115166305, + "step": 5351, + "time_per_iteration": 2.821139097213745 + }, + { + "auxiliary_loss_clip": 0.01120336, + "auxiliary_loss_mlp": 0.01085037, + "balance_loss_clip": 1.02176213, + "balance_loss_mlp": 1.00478554, + "epoch": 0.6435399507004148, + "flos": 15012492468480.0, + "grad_norm": 2.1158540758802333, + "language_loss": 0.82310236, + "learning_rate": 1.191131137404645e-06, + "loss": 0.84515607, + "num_input_tokens_seen": 115183045, + "step": 5352, + "time_per_iteration": 3.467459201812744 + }, + { + "auxiliary_loss_clip": 0.01110811, + "auxiliary_loss_mlp": 0.01085298, + "balance_loss_clip": 1.02551627, + "balance_loss_mlp": 1.00499892, + "epoch": 0.643660193591054, + "flos": 19901981462400.0, + "grad_norm": 2.0914366213801467, + "language_loss": 0.77331156, + "learning_rate": 1.190418777015333e-06, + "loss": 0.79527265, + "num_input_tokens_seen": 115201955, + "step": 5353, + "time_per_iteration": 2.7618002891540527 + }, + { + "auxiliary_loss_clip": 0.01117227, + "auxiliary_loss_mlp": 0.01083342, + "balance_loss_clip": 1.02412629, + "balance_loss_mlp": 1.00318539, + "epoch": 0.643780436481693, + "flos": 24133622820480.0, + "grad_norm": 1.500380098799834, + "language_loss": 0.73602879, + "learning_rate": 1.1897065394388487e-06, + "loss": 0.75803447, + "num_input_tokens_seen": 115222395, + "step": 5354, + "time_per_iteration": 2.7484307289123535 + }, + { + "auxiliary_loss_clip": 0.01117381, + "auxiliary_loss_mlp": 0.01086504, + "balance_loss_clip": 1.02601266, + "balance_loss_mlp": 1.00625253, + "epoch": 0.6439006793723321, + "flos": 23148808657920.0, + "grad_norm": 1.6716649828124825, + "language_loss": 0.76623368, + "learning_rate": 1.1889944247832385e-06, + "loss": 0.78827256, + "num_input_tokens_seen": 115242635, + "step": 5355, + "time_per_iteration": 2.71279239654541 + }, + { + "auxiliary_loss_clip": 0.01125988, + "auxiliary_loss_mlp": 0.01084761, + "balance_loss_clip": 1.02428865, + "balance_loss_mlp": 1.00450933, + "epoch": 0.6440209222629713, + "flos": 23617909301760.0, + "grad_norm": 1.78516561087783, + "language_loss": 0.7083202, + "learning_rate": 1.1882824331565283e-06, + "loss": 0.73042774, + "num_input_tokens_seen": 115262095, + "step": 5356, + "time_per_iteration": 3.612994432449341 + }, + { + "auxiliary_loss_clip": 0.01110987, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_clip": 1.02523088, + "balance_loss_mlp": 1.00406933, + "epoch": 0.6441411651536103, + "flos": 16544872535040.0, + "grad_norm": 2.155382583060003, + "language_loss": 0.89019597, + "learning_rate": 1.1875705646667287e-06, + "loss": 0.91214901, + "num_input_tokens_seen": 115279985, + "step": 5357, + "time_per_iteration": 4.124113321304321 + }, + { + "auxiliary_loss_clip": 0.01125896, + "auxiliary_loss_mlp": 0.01084295, + "balance_loss_clip": 1.02411187, + "balance_loss_mlp": 1.00399566, + "epoch": 0.6442614080442494, + "flos": 25410965345280.0, + "grad_norm": 1.8924651037442475, + "language_loss": 0.75457931, + "learning_rate": 1.1868588194218282e-06, + "loss": 0.77668118, + "num_input_tokens_seen": 115300365, + "step": 5358, + "time_per_iteration": 3.661365509033203 + }, + { + "auxiliary_loss_clip": 0.01118648, + "auxiliary_loss_mlp": 0.01085485, + "balance_loss_clip": 1.02463806, + "balance_loss_mlp": 1.00532889, + "epoch": 0.6443816509348885, + "flos": 28294017552000.0, + "grad_norm": 1.4861946489670006, + "language_loss": 0.74099481, + "learning_rate": 1.1861471975297979e-06, + "loss": 0.76303613, + "num_input_tokens_seen": 115322060, + "step": 5359, + "time_per_iteration": 2.7731456756591797 + }, + { + "auxiliary_loss_clip": 0.01098164, + "auxiliary_loss_mlp": 0.0108543, + "balance_loss_clip": 1.02296066, + "balance_loss_mlp": 1.00503576, + "epoch": 0.6445018938255276, + "flos": 36690075964800.0, + "grad_norm": 1.6187745102366689, + "language_loss": 0.70864135, + "learning_rate": 1.185435699098591e-06, + "loss": 0.73047721, + "num_input_tokens_seen": 115348255, + "step": 5360, + "time_per_iteration": 2.9701223373413086 + }, + { + "auxiliary_loss_clip": 0.01119098, + "auxiliary_loss_mlp": 0.01085718, + "balance_loss_clip": 1.02574229, + "balance_loss_mlp": 1.00537109, + "epoch": 0.6446221367161666, + "flos": 14501411804160.0, + "grad_norm": 2.34888605480162, + "language_loss": 0.77852136, + "learning_rate": 1.1847243242361403e-06, + "loss": 0.80056953, + "num_input_tokens_seen": 115366845, + "step": 5361, + "time_per_iteration": 2.7215044498443604 + }, + { + "auxiliary_loss_clip": 0.01112406, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_clip": 1.02139258, + "balance_loss_mlp": 1.00391126, + "epoch": 0.6447423796068057, + "flos": 24609367480320.0, + "grad_norm": 1.4910479494557112, + "language_loss": 0.78014928, + "learning_rate": 1.1840130730503624e-06, + "loss": 0.80211449, + "num_input_tokens_seen": 115388125, + "step": 5362, + "time_per_iteration": 2.691789150238037 + }, + { + "auxiliary_loss_clip": 0.01136203, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_clip": 1.026281, + "balance_loss_mlp": 1.0054028, + "epoch": 0.6448626224974449, + "flos": 25047298097280.0, + "grad_norm": 1.5814946938200882, + "language_loss": 0.74954891, + "learning_rate": 1.1833019456491518e-06, + "loss": 0.77176702, + "num_input_tokens_seen": 115409655, + "step": 5363, + "time_per_iteration": 2.6338272094726562 + }, + { + "auxiliary_loss_clip": 0.01124969, + "auxiliary_loss_mlp": 0.01084735, + "balance_loss_clip": 1.02472687, + "balance_loss_mlp": 1.00448322, + "epoch": 0.6449828653880839, + "flos": 22530355263360.0, + "grad_norm": 1.9601486418708833, + "language_loss": 0.78531301, + "learning_rate": 1.1825909421403871e-06, + "loss": 0.80741006, + "num_input_tokens_seen": 115428750, + "step": 5364, + "time_per_iteration": 2.691657304763794 + }, + { + "auxiliary_loss_clip": 0.0112561, + "auxiliary_loss_mlp": 0.01084032, + "balance_loss_clip": 1.02432775, + "balance_loss_mlp": 1.00387549, + "epoch": 0.645103108278723, + "flos": 25695736369920.0, + "grad_norm": 1.8219111002970296, + "language_loss": 0.76262116, + "learning_rate": 1.181880062631926e-06, + "loss": 0.78471756, + "num_input_tokens_seen": 115448085, + "step": 5365, + "time_per_iteration": 2.6442978382110596 + }, + { + "auxiliary_loss_clip": 0.01119631, + "auxiliary_loss_mlp": 0.01086155, + "balance_loss_clip": 1.02628005, + "balance_loss_mlp": 1.0058552, + "epoch": 0.6452233511693621, + "flos": 27450331925760.0, + "grad_norm": 8.195160091300744, + "language_loss": 0.84374571, + "learning_rate": 1.1811693072316093e-06, + "loss": 0.8658036, + "num_input_tokens_seen": 115465765, + "step": 5366, + "time_per_iteration": 2.811509132385254 + }, + { + "auxiliary_loss_clip": 0.01135711, + "auxiliary_loss_mlp": 0.00872909, + "balance_loss_clip": 1.0258038, + "balance_loss_mlp": 1.00009537, + "epoch": 0.6453435940600012, + "flos": 19208618254080.0, + "grad_norm": 2.067098956186881, + "language_loss": 0.83840925, + "learning_rate": 1.1804586760472574e-06, + "loss": 0.85849547, + "num_input_tokens_seen": 115482230, + "step": 5367, + "time_per_iteration": 2.587362766265869 + }, + { + "auxiliary_loss_clip": 0.01107759, + "auxiliary_loss_mlp": 0.01084996, + "balance_loss_clip": 1.02386487, + "balance_loss_mlp": 1.00483978, + "epoch": 0.6454638369506402, + "flos": 25737680476800.0, + "grad_norm": 2.3252682647899747, + "language_loss": 0.80016875, + "learning_rate": 1.1797481691866736e-06, + "loss": 0.82209629, + "num_input_tokens_seen": 115499455, + "step": 5368, + "time_per_iteration": 2.752237558364868 + }, + { + "auxiliary_loss_clip": 0.01112141, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_clip": 1.02127743, + "balance_loss_mlp": 1.00471854, + "epoch": 0.6455840798412794, + "flos": 20989176364800.0, + "grad_norm": 1.910176496728613, + "language_loss": 0.83077592, + "learning_rate": 1.1790377867576393e-06, + "loss": 0.85274702, + "num_input_tokens_seen": 115517205, + "step": 5369, + "time_per_iteration": 2.668733596801758 + }, + { + "auxiliary_loss_clip": 0.01117263, + "auxiliary_loss_mlp": 0.01084427, + "balance_loss_clip": 1.02395439, + "balance_loss_mlp": 1.00422275, + "epoch": 0.6457043227319185, + "flos": 26067556005120.0, + "grad_norm": 1.8876916161552777, + "language_loss": 0.76270884, + "learning_rate": 1.1783275288679203e-06, + "loss": 0.78472579, + "num_input_tokens_seen": 115534370, + "step": 5370, + "time_per_iteration": 3.032576322555542 + }, + { + "auxiliary_loss_clip": 0.01107527, + "auxiliary_loss_mlp": 0.01078684, + "balance_loss_clip": 1.01988196, + "balance_loss_mlp": 0.99972004, + "epoch": 0.6458245656225575, + "flos": 60370831088640.0, + "grad_norm": 1.1616852101545954, + "language_loss": 0.57222527, + "learning_rate": 1.177617395625262e-06, + "loss": 0.59408736, + "num_input_tokens_seen": 115592345, + "step": 5371, + "time_per_iteration": 3.202582836151123 + }, + { + "auxiliary_loss_clip": 0.01126142, + "auxiliary_loss_mlp": 0.01083989, + "balance_loss_clip": 1.02570474, + "balance_loss_mlp": 1.00378513, + "epoch": 0.6459448085131967, + "flos": 23076771932160.0, + "grad_norm": 2.2943607199046765, + "language_loss": 0.75055289, + "learning_rate": 1.1769073871373908e-06, + "loss": 0.77265424, + "num_input_tokens_seen": 115612550, + "step": 5372, + "time_per_iteration": 2.6805226802825928 + }, + { + "auxiliary_loss_clip": 0.01108529, + "auxiliary_loss_mlp": 0.01083845, + "balance_loss_clip": 1.02361619, + "balance_loss_mlp": 1.00364053, + "epoch": 0.6460650514038357, + "flos": 22598190097920.0, + "grad_norm": 1.5686214879900822, + "language_loss": 0.83641016, + "learning_rate": 1.176197503512015e-06, + "loss": 0.85833389, + "num_input_tokens_seen": 115632265, + "step": 5373, + "time_per_iteration": 2.8018105030059814 + }, + { + "auxiliary_loss_clip": 0.01121127, + "auxiliary_loss_mlp": 0.01084076, + "balance_loss_clip": 1.02727723, + "balance_loss_mlp": 1.00396729, + "epoch": 0.6461852942944748, + "flos": 20266726118400.0, + "grad_norm": 2.2211405935789994, + "language_loss": 0.82567179, + "learning_rate": 1.1754877448568223e-06, + "loss": 0.84772384, + "num_input_tokens_seen": 115651720, + "step": 5374, + "time_per_iteration": 2.700428009033203 + }, + { + "auxiliary_loss_clip": 0.01117979, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.02577496, + "balance_loss_mlp": 1.0034399, + "epoch": 0.646305537185114, + "flos": 23367109564800.0, + "grad_norm": 1.944224274599558, + "language_loss": 0.90057272, + "learning_rate": 1.1747781112794837e-06, + "loss": 0.92258853, + "num_input_tokens_seen": 115668215, + "step": 5375, + "time_per_iteration": 2.7751007080078125 + }, + { + "auxiliary_loss_clip": 0.01111339, + "auxiliary_loss_mlp": 0.01084212, + "balance_loss_clip": 1.02589774, + "balance_loss_mlp": 1.00400758, + "epoch": 0.646425780075753, + "flos": 24277480790400.0, + "grad_norm": 1.7781931024994126, + "language_loss": 0.829355, + "learning_rate": 1.1740686028876487e-06, + "loss": 0.85131049, + "num_input_tokens_seen": 115687080, + "step": 5376, + "time_per_iteration": 2.778207778930664 + }, + { + "auxiliary_loss_clip": 0.01124706, + "auxiliary_loss_mlp": 0.0108423, + "balance_loss_clip": 1.02474976, + "balance_loss_mlp": 1.00397825, + "epoch": 0.6465460229663921, + "flos": 20813968800000.0, + "grad_norm": 2.070437346999438, + "language_loss": 0.74489141, + "learning_rate": 1.1733592197889507e-06, + "loss": 0.76698077, + "num_input_tokens_seen": 115703990, + "step": 5377, + "time_per_iteration": 2.696955919265747 + }, + { + "auxiliary_loss_clip": 0.01119409, + "auxiliary_loss_mlp": 0.01083662, + "balance_loss_clip": 1.02465963, + "balance_loss_mlp": 1.00355339, + "epoch": 0.6466662658570312, + "flos": 22853299466880.0, + "grad_norm": 1.716383204916678, + "language_loss": 0.72386569, + "learning_rate": 1.1726499620910014e-06, + "loss": 0.7458964, + "num_input_tokens_seen": 115724270, + "step": 5378, + "time_per_iteration": 3.6146533489227295 + }, + { + "auxiliary_loss_clip": 0.01124818, + "auxiliary_loss_mlp": 0.01083822, + "balance_loss_clip": 1.02409708, + "balance_loss_mlp": 1.00357032, + "epoch": 0.6467865087476703, + "flos": 15304553953920.0, + "grad_norm": 1.9421857633605228, + "language_loss": 0.77869976, + "learning_rate": 1.1719408299013955e-06, + "loss": 0.80078614, + "num_input_tokens_seen": 115742995, + "step": 5379, + "time_per_iteration": 2.616257905960083 + }, + { + "auxiliary_loss_clip": 0.01136126, + "auxiliary_loss_mlp": 0.01084048, + "balance_loss_clip": 1.02643454, + "balance_loss_mlp": 1.00389218, + "epoch": 0.6469067516383094, + "flos": 19573650218880.0, + "grad_norm": 4.3801256047874375, + "language_loss": 0.75241911, + "learning_rate": 1.1712318233277067e-06, + "loss": 0.77462089, + "num_input_tokens_seen": 115762015, + "step": 5380, + "time_per_iteration": 2.637733221054077 + }, + { + "auxiliary_loss_clip": 0.01106972, + "auxiliary_loss_mlp": 0.01078668, + "balance_loss_clip": 1.01947093, + "balance_loss_mlp": 0.99970388, + "epoch": 0.6470269945289485, + "flos": 65098002522240.0, + "grad_norm": 0.7461029914403778, + "language_loss": 0.57911289, + "learning_rate": 1.1705229424774916e-06, + "loss": 0.60096931, + "num_input_tokens_seen": 115816285, + "step": 5381, + "time_per_iteration": 3.0840113162994385 + }, + { + "auxiliary_loss_clip": 0.01119059, + "auxiliary_loss_mlp": 0.01083743, + "balance_loss_clip": 1.02543783, + "balance_loss_mlp": 1.00349176, + "epoch": 0.6471472374195876, + "flos": 30696943639680.0, + "grad_norm": 1.6997794052504334, + "language_loss": 0.63736928, + "learning_rate": 1.1698141874582867e-06, + "loss": 0.6593973, + "num_input_tokens_seen": 115837330, + "step": 5382, + "time_per_iteration": 4.607290744781494 + }, + { + "auxiliary_loss_clip": 0.01135554, + "auxiliary_loss_mlp": 0.01084432, + "balance_loss_clip": 1.02625442, + "balance_loss_mlp": 1.00422823, + "epoch": 0.6472674803102266, + "flos": 20521835487360.0, + "grad_norm": 1.9662443460768604, + "language_loss": 0.72218472, + "learning_rate": 1.169105558377609e-06, + "loss": 0.74438459, + "num_input_tokens_seen": 115857420, + "step": 5383, + "time_per_iteration": 2.686288833618164 + }, + { + "auxiliary_loss_clip": 0.01075703, + "auxiliary_loss_mlp": 0.00872951, + "balance_loss_clip": 1.02103996, + "balance_loss_mlp": 1.00014222, + "epoch": 0.6473877232008658, + "flos": 24715447320960.0, + "grad_norm": 2.0986518648336587, + "language_loss": 0.78317934, + "learning_rate": 1.1683970553429587e-06, + "loss": 0.80266589, + "num_input_tokens_seen": 115878875, + "step": 5384, + "time_per_iteration": 3.763706922531128 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01083772, + "balance_loss_clip": 1.02436471, + "balance_loss_mlp": 1.00356793, + "epoch": 0.6475079660915048, + "flos": 15885552441600.0, + "grad_norm": 1.712256967790884, + "language_loss": 0.8174715, + "learning_rate": 1.1676886784618128e-06, + "loss": 0.83938503, + "num_input_tokens_seen": 115895540, + "step": 5385, + "time_per_iteration": 2.6831305027008057 + }, + { + "auxiliary_loss_clip": 0.01126657, + "auxiliary_loss_mlp": 0.01083993, + "balance_loss_clip": 1.0255965, + "balance_loss_mlp": 1.00378859, + "epoch": 0.6476282089821439, + "flos": 17381590922880.0, + "grad_norm": 2.5806905328952703, + "language_loss": 0.84349614, + "learning_rate": 1.1669804278416332e-06, + "loss": 0.86560261, + "num_input_tokens_seen": 115910265, + "step": 5386, + "time_per_iteration": 2.6379594802856445 + }, + { + "auxiliary_loss_clip": 0.01117718, + "auxiliary_loss_mlp": 0.01084493, + "balance_loss_clip": 1.02515793, + "balance_loss_mlp": 1.00419414, + "epoch": 0.6477484518727831, + "flos": 20194078861440.0, + "grad_norm": 1.6892991323704794, + "language_loss": 0.71528029, + "learning_rate": 1.1662723035898602e-06, + "loss": 0.73730242, + "num_input_tokens_seen": 115930025, + "step": 5387, + "time_per_iteration": 2.659677267074585 + }, + { + "auxiliary_loss_clip": 0.01126721, + "auxiliary_loss_mlp": 0.01084663, + "balance_loss_clip": 1.02634168, + "balance_loss_mlp": 1.00426829, + "epoch": 0.6478686947634221, + "flos": 25410426641280.0, + "grad_norm": 1.616474488969202, + "language_loss": 0.81939423, + "learning_rate": 1.165564305813915e-06, + "loss": 0.84150803, + "num_input_tokens_seen": 115949025, + "step": 5388, + "time_per_iteration": 2.7303307056427 + }, + { + "auxiliary_loss_clip": 0.01125928, + "auxiliary_loss_mlp": 0.01083927, + "balance_loss_clip": 1.02472973, + "balance_loss_mlp": 1.00381875, + "epoch": 0.6479889376540612, + "flos": 20083581648000.0, + "grad_norm": 1.6907316029541244, + "language_loss": 0.81135082, + "learning_rate": 1.1648564346212019e-06, + "loss": 0.83344936, + "num_input_tokens_seen": 115968145, + "step": 5389, + "time_per_iteration": 2.652113437652588 + }, + { + "auxiliary_loss_clip": 0.01125646, + "auxiliary_loss_mlp": 0.01083944, + "balance_loss_clip": 1.02522922, + "balance_loss_mlp": 1.00393033, + "epoch": 0.6481091805447003, + "flos": 26758082039040.0, + "grad_norm": 1.6623902485215438, + "language_loss": 0.75847781, + "learning_rate": 1.164148690119104e-06, + "loss": 0.78057361, + "num_input_tokens_seen": 115989425, + "step": 5390, + "time_per_iteration": 2.719562530517578 + }, + { + "auxiliary_loss_clip": 0.01135209, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_clip": 1.02549279, + "balance_loss_mlp": 1.00437748, + "epoch": 0.6482294234353394, + "flos": 23952094462080.0, + "grad_norm": 1.671477993775612, + "language_loss": 0.7399987, + "learning_rate": 1.163441072414985e-06, + "loss": 0.76219559, + "num_input_tokens_seen": 116009630, + "step": 5391, + "time_per_iteration": 2.6255619525909424 + }, + { + "auxiliary_loss_clip": 0.01127381, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_clip": 1.0267638, + "balance_loss_mlp": 1.00385499, + "epoch": 0.6483496663259785, + "flos": 26209833776640.0, + "grad_norm": 1.7876197734293617, + "language_loss": 0.69781911, + "learning_rate": 1.16273358161619e-06, + "loss": 0.71993303, + "num_input_tokens_seen": 116029965, + "step": 5392, + "time_per_iteration": 2.727060317993164 + }, + { + "auxiliary_loss_clip": 0.01102239, + "auxiliary_loss_mlp": 0.01084073, + "balance_loss_clip": 1.02531672, + "balance_loss_mlp": 1.00391698, + "epoch": 0.6484699092166175, + "flos": 20922239370240.0, + "grad_norm": 1.7620553098981664, + "language_loss": 0.83621556, + "learning_rate": 1.1620262178300446e-06, + "loss": 0.85807872, + "num_input_tokens_seen": 116048580, + "step": 5393, + "time_per_iteration": 2.628298282623291 + }, + { + "auxiliary_loss_clip": 0.01108819, + "auxiliary_loss_mlp": 0.01083625, + "balance_loss_clip": 1.02458191, + "balance_loss_mlp": 1.00342047, + "epoch": 0.6485901521072567, + "flos": 33072865678080.0, + "grad_norm": 1.962300826028412, + "language_loss": 0.76256049, + "learning_rate": 1.1613189811638563e-06, + "loss": 0.78448498, + "num_input_tokens_seen": 116070305, + "step": 5394, + "time_per_iteration": 2.866955518722534 + }, + { + "auxiliary_loss_clip": 0.0112891, + "auxiliary_loss_mlp": 0.01085142, + "balance_loss_clip": 1.02785325, + "balance_loss_mlp": 1.00508106, + "epoch": 0.6487103949978957, + "flos": 22274060745600.0, + "grad_norm": 1.5653671939020644, + "language_loss": 0.78095514, + "learning_rate": 1.1606118717249117e-06, + "loss": 0.8030957, + "num_input_tokens_seen": 116090405, + "step": 5395, + "time_per_iteration": 2.6259634494781494 + }, + { + "auxiliary_loss_clip": 0.01136064, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_clip": 1.02561092, + "balance_loss_mlp": 1.00446105, + "epoch": 0.6488306378885348, + "flos": 22930400010240.0, + "grad_norm": 1.7106055563306968, + "language_loss": 0.68035305, + "learning_rate": 1.1599048896204787e-06, + "loss": 0.70256126, + "num_input_tokens_seen": 116110285, + "step": 5396, + "time_per_iteration": 2.6036388874053955 + }, + { + "auxiliary_loss_clip": 0.01103478, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_clip": 1.02034879, + "balance_loss_mlp": 1.00486672, + "epoch": 0.648950880779174, + "flos": 20376110010240.0, + "grad_norm": 1.7570198529644445, + "language_loss": 0.8070237, + "learning_rate": 1.1591980349578061e-06, + "loss": 0.82890868, + "num_input_tokens_seen": 116128955, + "step": 5397, + "time_per_iteration": 2.74674391746521 + }, + { + "auxiliary_loss_clip": 0.01091819, + "auxiliary_loss_mlp": 0.01080473, + "balance_loss_clip": 1.02082419, + "balance_loss_mlp": 1.00150895, + "epoch": 0.649071123669813, + "flos": 59930889310080.0, + "grad_norm": 0.7901077772251114, + "language_loss": 0.54304922, + "learning_rate": 1.158491307844123e-06, + "loss": 0.56477219, + "num_input_tokens_seen": 116188875, + "step": 5398, + "time_per_iteration": 3.2804830074310303 + }, + { + "auxiliary_loss_clip": 0.01118981, + "auxiliary_loss_mlp": 0.01084835, + "balance_loss_clip": 1.026618, + "balance_loss_mlp": 1.00467873, + "epoch": 0.6491913665604521, + "flos": 20446566537600.0, + "grad_norm": 5.8152678757752945, + "language_loss": 0.84210336, + "learning_rate": 1.1577847083866387e-06, + "loss": 0.86414152, + "num_input_tokens_seen": 116207910, + "step": 5399, + "time_per_iteration": 2.6682991981506348 + }, + { + "auxiliary_loss_clip": 0.01117351, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_clip": 1.02479661, + "balance_loss_mlp": 1.00402641, + "epoch": 0.6493116094510912, + "flos": 16946820702720.0, + "grad_norm": 1.800654705204526, + "language_loss": 0.71918017, + "learning_rate": 1.1570782366925453e-06, + "loss": 0.74119788, + "num_input_tokens_seen": 116226425, + "step": 5400, + "time_per_iteration": 2.6693832874298096 + }, + { + "auxiliary_loss_clip": 0.01117922, + "auxiliary_loss_mlp": 0.01085058, + "balance_loss_clip": 1.0239414, + "balance_loss_mlp": 1.00485444, + "epoch": 0.6494318523417303, + "flos": 18802935072000.0, + "grad_norm": 1.5478552742218115, + "language_loss": 0.75361979, + "learning_rate": 1.1563718928690132e-06, + "loss": 0.77564967, + "num_input_tokens_seen": 116243860, + "step": 5401, + "time_per_iteration": 2.7848920822143555 + }, + { + "auxiliary_loss_clip": 0.01105723, + "auxiliary_loss_mlp": 0.01084387, + "balance_loss_clip": 1.02385652, + "balance_loss_mlp": 1.00418341, + "epoch": 0.6495520952323693, + "flos": 18982847318400.0, + "grad_norm": 2.5066311243109443, + "language_loss": 0.71348941, + "learning_rate": 1.1556656770231942e-06, + "loss": 0.73539048, + "num_input_tokens_seen": 116260055, + "step": 5402, + "time_per_iteration": 3.5451550483703613 + }, + { + "auxiliary_loss_clip": 0.01126578, + "auxiliary_loss_mlp": 0.01083389, + "balance_loss_clip": 1.02493453, + "balance_loss_mlp": 1.00328052, + "epoch": 0.6496723381230085, + "flos": 22745388032640.0, + "grad_norm": 1.4351904469779881, + "language_loss": 0.76328611, + "learning_rate": 1.1549595892622207e-06, + "loss": 0.78538579, + "num_input_tokens_seen": 116278825, + "step": 5403, + "time_per_iteration": 2.711578607559204 + }, + { + "auxiliary_loss_clip": 0.01079346, + "auxiliary_loss_mlp": 0.01080169, + "balance_loss_clip": 1.02640891, + "balance_loss_mlp": 1.00120485, + "epoch": 0.6497925810136476, + "flos": 62145283887360.0, + "grad_norm": 0.8105769229184254, + "language_loss": 0.59110522, + "learning_rate": 1.1542536296932047e-06, + "loss": 0.61270034, + "num_input_tokens_seen": 116342360, + "step": 5404, + "time_per_iteration": 3.321629762649536 + }, + { + "auxiliary_loss_clip": 0.011098, + "auxiliary_loss_mlp": 0.01085551, + "balance_loss_clip": 1.02476621, + "balance_loss_mlp": 1.00529921, + "epoch": 0.6499128239042866, + "flos": 20156731695360.0, + "grad_norm": 8.640790419869004, + "language_loss": 0.69996357, + "learning_rate": 1.1535477984232414e-06, + "loss": 0.72191703, + "num_input_tokens_seen": 116362235, + "step": 5405, + "time_per_iteration": 2.848796844482422 + }, + { + "auxiliary_loss_clip": 0.01084414, + "auxiliary_loss_mlp": 0.01083575, + "balance_loss_clip": 1.02387607, + "balance_loss_mlp": 1.00341916, + "epoch": 0.6500330667949258, + "flos": 24462420940800.0, + "grad_norm": 2.1165271056962296, + "language_loss": 0.76957667, + "learning_rate": 1.152842095559404e-06, + "loss": 0.79125661, + "num_input_tokens_seen": 116382895, + "step": 5406, + "time_per_iteration": 2.8601813316345215 + }, + { + "auxiliary_loss_clip": 0.01117607, + "auxiliary_loss_mlp": 0.01083595, + "balance_loss_clip": 1.02410734, + "balance_loss_mlp": 1.00348651, + "epoch": 0.6501533096855648, + "flos": 25477399549440.0, + "grad_norm": 1.6000712005429307, + "language_loss": 0.76796252, + "learning_rate": 1.1521365212087474e-06, + "loss": 0.78997457, + "num_input_tokens_seen": 116402880, + "step": 5407, + "time_per_iteration": 4.650601387023926 + }, + { + "auxiliary_loss_clip": 0.011249, + "auxiliary_loss_mlp": 0.01083623, + "balance_loss_clip": 1.02431333, + "balance_loss_mlp": 1.00337172, + "epoch": 0.6502735525762039, + "flos": 44819245347840.0, + "grad_norm": 1.6487782996842444, + "language_loss": 0.70478594, + "learning_rate": 1.1514310754783062e-06, + "loss": 0.72687107, + "num_input_tokens_seen": 116425830, + "step": 5408, + "time_per_iteration": 2.836437702178955 + }, + { + "auxiliary_loss_clip": 0.01117623, + "auxiliary_loss_mlp": 0.01084343, + "balance_loss_clip": 1.02523625, + "balance_loss_mlp": 1.00428224, + "epoch": 0.6503937954668431, + "flos": 28658546726400.0, + "grad_norm": 1.8678550558840128, + "language_loss": 0.73070663, + "learning_rate": 1.1507257584750964e-06, + "loss": 0.75272632, + "num_input_tokens_seen": 116446010, + "step": 5409, + "time_per_iteration": 3.666938066482544 + }, + { + "auxiliary_loss_clip": 0.01137025, + "auxiliary_loss_mlp": 0.01083466, + "balance_loss_clip": 1.02707875, + "balance_loss_mlp": 1.00326169, + "epoch": 0.6505140383574821, + "flos": 20922562592640.0, + "grad_norm": 1.715986074428282, + "language_loss": 0.77263021, + "learning_rate": 1.150020570306113e-06, + "loss": 0.79483509, + "num_input_tokens_seen": 116465150, + "step": 5410, + "time_per_iteration": 2.624685287475586 + }, + { + "auxiliary_loss_clip": 0.01118978, + "auxiliary_loss_mlp": 0.01084553, + "balance_loss_clip": 1.0247525, + "balance_loss_mlp": 1.00430131, + "epoch": 0.6506342812481212, + "flos": 20595236929920.0, + "grad_norm": 1.6722699988287328, + "language_loss": 0.747913, + "learning_rate": 1.1493155110783338e-06, + "loss": 0.76994836, + "num_input_tokens_seen": 116483675, + "step": 5411, + "time_per_iteration": 2.781642198562622 + }, + { + "auxiliary_loss_clip": 0.01125468, + "auxiliary_loss_mlp": 0.01084923, + "balance_loss_clip": 1.02484119, + "balance_loss_mlp": 1.00467086, + "epoch": 0.6507545241387603, + "flos": 30226478279040.0, + "grad_norm": 1.8675151469860745, + "language_loss": 0.7096417, + "learning_rate": 1.1486105808987155e-06, + "loss": 0.73174566, + "num_input_tokens_seen": 116505165, + "step": 5412, + "time_per_iteration": 2.7381176948547363 + }, + { + "auxiliary_loss_clip": 0.01125432, + "auxiliary_loss_mlp": 0.01084395, + "balance_loss_clip": 1.02525353, + "balance_loss_mlp": 1.00419044, + "epoch": 0.6508747670293994, + "flos": 17128241320320.0, + "grad_norm": 1.7418887447004874, + "language_loss": 0.81320816, + "learning_rate": 1.1479057798741947e-06, + "loss": 0.83530647, + "num_input_tokens_seen": 116523220, + "step": 5413, + "time_per_iteration": 2.5958635807037354 + }, + { + "auxiliary_loss_clip": 0.0109037, + "auxiliary_loss_mlp": 0.01080348, + "balance_loss_clip": 1.01133561, + "balance_loss_mlp": 1.00138402, + "epoch": 0.6509950099200384, + "flos": 68559826573440.0, + "grad_norm": 0.7808860263702239, + "language_loss": 0.53362274, + "learning_rate": 1.14720110811169e-06, + "loss": 0.55532992, + "num_input_tokens_seen": 116580450, + "step": 5414, + "time_per_iteration": 3.2565581798553467 + }, + { + "auxiliary_loss_clip": 0.01127033, + "auxiliary_loss_mlp": 0.01084377, + "balance_loss_clip": 1.02546501, + "balance_loss_mlp": 1.00412512, + "epoch": 0.6511152528106776, + "flos": 22347462188160.0, + "grad_norm": 1.759410436082081, + "language_loss": 0.76847357, + "learning_rate": 1.146496565718098e-06, + "loss": 0.79058766, + "num_input_tokens_seen": 116601020, + "step": 5415, + "time_per_iteration": 2.801450490951538 + }, + { + "auxiliary_loss_clip": 0.01111744, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_clip": 1.0217669, + "balance_loss_mlp": 1.0045557, + "epoch": 0.6512354957013167, + "flos": 20522158709760.0, + "grad_norm": 2.202000497172328, + "language_loss": 0.75712276, + "learning_rate": 1.1457921528002996e-06, + "loss": 0.77908778, + "num_input_tokens_seen": 116619455, + "step": 5416, + "time_per_iteration": 2.7875499725341797 + }, + { + "auxiliary_loss_clip": 0.01136088, + "auxiliary_loss_mlp": 0.00873028, + "balance_loss_clip": 1.02597046, + "balance_loss_mlp": 1.00014567, + "epoch": 0.6513557385919557, + "flos": 32337342881280.0, + "grad_norm": 3.37880630289937, + "language_loss": 0.72328818, + "learning_rate": 1.1450878694651522e-06, + "loss": 0.74337929, + "num_input_tokens_seen": 116640020, + "step": 5417, + "time_per_iteration": 2.7624385356903076 + }, + { + "auxiliary_loss_clip": 0.01098181, + "auxiliary_loss_mlp": 0.01084396, + "balance_loss_clip": 1.02245438, + "balance_loss_mlp": 1.0042398, + "epoch": 0.6514759814825949, + "flos": 12093206417280.0, + "grad_norm": 2.321351703249415, + "language_loss": 0.62607485, + "learning_rate": 1.1443837158194954e-06, + "loss": 0.64790064, + "num_input_tokens_seen": 116655165, + "step": 5418, + "time_per_iteration": 2.7416419982910156 + }, + { + "auxiliary_loss_clip": 0.01101567, + "auxiliary_loss_mlp": 0.0108723, + "balance_loss_clip": 1.01968884, + "balance_loss_mlp": 1.00693011, + "epoch": 0.651596224373234, + "flos": 22526907557760.0, + "grad_norm": 1.4603136059850395, + "language_loss": 0.74377799, + "learning_rate": 1.1436796919701484e-06, + "loss": 0.76566589, + "num_input_tokens_seen": 116673880, + "step": 5419, + "time_per_iteration": 2.787914276123047 + }, + { + "auxiliary_loss_clip": 0.01111134, + "auxiliary_loss_mlp": 0.01083855, + "balance_loss_clip": 1.02064347, + "balance_loss_mlp": 1.00355554, + "epoch": 0.651716467263873, + "flos": 27818955250560.0, + "grad_norm": 1.6816640761691468, + "language_loss": 0.61610866, + "learning_rate": 1.1429757980239115e-06, + "loss": 0.63805848, + "num_input_tokens_seen": 116694305, + "step": 5420, + "time_per_iteration": 2.7713658809661865 + }, + { + "auxiliary_loss_clip": 0.01135474, + "auxiliary_loss_mlp": 0.01085061, + "balance_loss_clip": 1.02534199, + "balance_loss_mlp": 1.00466609, + "epoch": 0.6518367101545122, + "flos": 24316300414080.0, + "grad_norm": 4.442181403258858, + "language_loss": 0.82152992, + "learning_rate": 1.1422720340875636e-06, + "loss": 0.84373528, + "num_input_tokens_seen": 116713055, + "step": 5421, + "time_per_iteration": 2.6806349754333496 + }, + { + "auxiliary_loss_clip": 0.01128699, + "auxiliary_loss_mlp": 0.01084119, + "balance_loss_clip": 1.02681577, + "balance_loss_mlp": 1.00391543, + "epoch": 0.6519569530451512, + "flos": 20011939971840.0, + "grad_norm": 2.160385419791928, + "language_loss": 0.79053181, + "learning_rate": 1.1415684002678671e-06, + "loss": 0.81266004, + "num_input_tokens_seen": 116731815, + "step": 5422, + "time_per_iteration": 2.628230333328247 + }, + { + "auxiliary_loss_clip": 0.01119935, + "auxiliary_loss_mlp": 0.01085247, + "balance_loss_clip": 1.02608371, + "balance_loss_mlp": 1.00485229, + "epoch": 0.6520771959357903, + "flos": 21576064682880.0, + "grad_norm": 2.155797974894084, + "language_loss": 0.77824759, + "learning_rate": 1.1408648966715617e-06, + "loss": 0.80029941, + "num_input_tokens_seen": 116749335, + "step": 5423, + "time_per_iteration": 2.74477481842041 + }, + { + "auxiliary_loss_clip": 0.01120453, + "auxiliary_loss_mlp": 0.01084772, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.00437713, + "epoch": 0.6521974388264293, + "flos": 22711021695360.0, + "grad_norm": 1.7048438681788274, + "language_loss": 0.72897482, + "learning_rate": 1.1401615234053683e-06, + "loss": 0.75102705, + "num_input_tokens_seen": 116768155, + "step": 5424, + "time_per_iteration": 2.693920850753784 + }, + { + "auxiliary_loss_clip": 0.0111706, + "auxiliary_loss_mlp": 0.01084771, + "balance_loss_clip": 1.02444828, + "balance_loss_mlp": 1.00447202, + "epoch": 0.6523176817170685, + "flos": 23002939526400.0, + "grad_norm": 1.7331189824948827, + "language_loss": 0.75848937, + "learning_rate": 1.1394582805759885e-06, + "loss": 0.78050768, + "num_input_tokens_seen": 116787435, + "step": 5425, + "time_per_iteration": 2.733189821243286 + }, + { + "auxiliary_loss_clip": 0.01125438, + "auxiliary_loss_mlp": 0.01084847, + "balance_loss_clip": 1.0252192, + "balance_loss_mlp": 1.00464272, + "epoch": 0.6524379246077076, + "flos": 21688249835520.0, + "grad_norm": 3.106933230765229, + "language_loss": 0.75745296, + "learning_rate": 1.1387551682901022e-06, + "loss": 0.7795558, + "num_input_tokens_seen": 116808040, + "step": 5426, + "time_per_iteration": 2.641606330871582 + }, + { + "auxiliary_loss_clip": 0.01105765, + "auxiliary_loss_mlp": 0.01085494, + "balance_loss_clip": 1.02272749, + "balance_loss_mlp": 1.00524187, + "epoch": 0.6525581674983466, + "flos": 19390936711680.0, + "grad_norm": 1.8431713066840345, + "language_loss": 0.70893025, + "learning_rate": 1.138052186654373e-06, + "loss": 0.73084283, + "num_input_tokens_seen": 116825510, + "step": 5427, + "time_per_iteration": 2.7662878036499023 + }, + { + "auxiliary_loss_clip": 0.01120456, + "auxiliary_loss_mlp": 0.01084499, + "balance_loss_clip": 1.02581096, + "balance_loss_mlp": 1.00419927, + "epoch": 0.6526784103889858, + "flos": 17165444832000.0, + "grad_norm": 1.9396002618506887, + "language_loss": 0.87936962, + "learning_rate": 1.1373493357754417e-06, + "loss": 0.90141922, + "num_input_tokens_seen": 116844415, + "step": 5428, + "time_per_iteration": 3.50467848777771 + }, + { + "auxiliary_loss_clip": 0.01135753, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.02551591, + "balance_loss_mlp": 1.00393844, + "epoch": 0.6527986532796248, + "flos": 18989168112000.0, + "grad_norm": 1.6748868546762452, + "language_loss": 0.77362669, + "learning_rate": 1.1366466157599303e-06, + "loss": 0.79582477, + "num_input_tokens_seen": 116863690, + "step": 5429, + "time_per_iteration": 2.611712694168091 + }, + { + "auxiliary_loss_clip": 0.01099169, + "auxiliary_loss_mlp": 0.00872878, + "balance_loss_clip": 1.02404737, + "balance_loss_mlp": 1.00015092, + "epoch": 0.6529188961702639, + "flos": 14238581011200.0, + "grad_norm": 2.1005008266607974, + "language_loss": 0.76459908, + "learning_rate": 1.1359440267144412e-06, + "loss": 0.78431958, + "num_input_tokens_seen": 116881145, + "step": 5430, + "time_per_iteration": 2.804612874984741 + }, + { + "auxiliary_loss_clip": 0.01128519, + "auxiliary_loss_mlp": 0.01084113, + "balance_loss_clip": 1.02664864, + "balance_loss_mlp": 1.00390947, + "epoch": 0.653039139060903, + "flos": 36682929158400.0, + "grad_norm": 1.7488867113765352, + "language_loss": 0.74286878, + "learning_rate": 1.1352415687455556e-06, + "loss": 0.76499516, + "num_input_tokens_seen": 116902405, + "step": 5431, + "time_per_iteration": 2.7633471488952637 + }, + { + "auxiliary_loss_clip": 0.01127193, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_clip": 1.02704155, + "balance_loss_mlp": 1.00465381, + "epoch": 0.6531593819515421, + "flos": 25376275785600.0, + "grad_norm": 2.8172834928456947, + "language_loss": 0.63570285, + "learning_rate": 1.1345392419598362e-06, + "loss": 0.65782428, + "num_input_tokens_seen": 116921285, + "step": 5432, + "time_per_iteration": 2.7451162338256836 + }, + { + "auxiliary_loss_clip": 0.01126653, + "auxiliary_loss_mlp": 0.01084352, + "balance_loss_clip": 1.02494073, + "balance_loss_mlp": 1.00410032, + "epoch": 0.6532796248421812, + "flos": 21178533888000.0, + "grad_norm": 1.6572465539654455, + "language_loss": 0.72033906, + "learning_rate": 1.1338370464638263e-06, + "loss": 0.7424491, + "num_input_tokens_seen": 116940685, + "step": 5433, + "time_per_iteration": 4.539402484893799 + }, + { + "auxiliary_loss_clip": 0.01135442, + "auxiliary_loss_mlp": 0.01084556, + "balance_loss_clip": 1.02570653, + "balance_loss_mlp": 1.00435209, + "epoch": 0.6533998677328203, + "flos": 17675950878720.0, + "grad_norm": 2.3463280278401224, + "language_loss": 0.63515711, + "learning_rate": 1.1331349823640474e-06, + "loss": 0.6573571, + "num_input_tokens_seen": 116958115, + "step": 5434, + "time_per_iteration": 2.7094202041625977 + }, + { + "auxiliary_loss_clip": 0.01129225, + "auxiliary_loss_mlp": 0.00872714, + "balance_loss_clip": 1.02772999, + "balance_loss_mlp": 1.0002352, + "epoch": 0.6535201106234594, + "flos": 28400384701440.0, + "grad_norm": 2.124177521269645, + "language_loss": 0.77927673, + "learning_rate": 1.132433049767003e-06, + "loss": 0.79929614, + "num_input_tokens_seen": 116976030, + "step": 5435, + "time_per_iteration": 3.583753824234009 + }, + { + "auxiliary_loss_clip": 0.01116157, + "auxiliary_loss_mlp": 0.01083858, + "balance_loss_clip": 1.02479506, + "balance_loss_mlp": 1.00374937, + "epoch": 0.6536403535140984, + "flos": 23586667447680.0, + "grad_norm": 1.54328189635784, + "language_loss": 0.81344306, + "learning_rate": 1.1317312487791748e-06, + "loss": 0.8354432, + "num_input_tokens_seen": 116997680, + "step": 5436, + "time_per_iteration": 2.7893238067626953 + }, + { + "auxiliary_loss_clip": 0.01127642, + "auxiliary_loss_mlp": 0.01084064, + "balance_loss_clip": 1.02572775, + "balance_loss_mlp": 1.00390744, + "epoch": 0.6537605964047376, + "flos": 21579476474880.0, + "grad_norm": 1.7708864823835346, + "language_loss": 0.72753024, + "learning_rate": 1.1310295795070253e-06, + "loss": 0.74964732, + "num_input_tokens_seen": 117017620, + "step": 5437, + "time_per_iteration": 2.6736106872558594 + }, + { + "auxiliary_loss_clip": 0.01085899, + "auxiliary_loss_mlp": 0.01084358, + "balance_loss_clip": 1.02473032, + "balance_loss_mlp": 1.00424993, + "epoch": 0.6538808392953767, + "flos": 26833997433600.0, + "grad_norm": 1.6733642590294657, + "language_loss": 0.80787522, + "learning_rate": 1.1303280420569982e-06, + "loss": 0.82957774, + "num_input_tokens_seen": 117039505, + "step": 5438, + "time_per_iteration": 2.823657989501953 + }, + { + "auxiliary_loss_clip": 0.01128863, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.02702212, + "balance_loss_mlp": 1.00425017, + "epoch": 0.6540010821860157, + "flos": 30738241301760.0, + "grad_norm": 1.8599582695835342, + "language_loss": 0.77459514, + "learning_rate": 1.1296266365355158e-06, + "loss": 0.79672778, + "num_input_tokens_seen": 117062890, + "step": 5439, + "time_per_iteration": 2.7528488636016846 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01084601, + "balance_loss_clip": 1.02319098, + "balance_loss_mlp": 1.00430155, + "epoch": 0.6541213250766549, + "flos": 26907147480960.0, + "grad_norm": 1.778970008727783, + "language_loss": 0.73809153, + "learning_rate": 1.1289253630489806e-06, + "loss": 0.75999939, + "num_input_tokens_seen": 117083940, + "step": 5440, + "time_per_iteration": 2.783295154571533 + }, + { + "auxiliary_loss_clip": 0.01129272, + "auxiliary_loss_mlp": 0.01085139, + "balance_loss_clip": 1.02661276, + "balance_loss_mlp": 1.00479209, + "epoch": 0.6542415679672939, + "flos": 19172384409600.0, + "grad_norm": 1.870277157141737, + "language_loss": 0.72556055, + "learning_rate": 1.1282242217037753e-06, + "loss": 0.74770463, + "num_input_tokens_seen": 117101440, + "step": 5441, + "time_per_iteration": 2.6228597164154053 + }, + { + "auxiliary_loss_clip": 0.01102448, + "auxiliary_loss_mlp": 0.01084625, + "balance_loss_clip": 1.02505469, + "balance_loss_mlp": 1.00427747, + "epoch": 0.654361810857933, + "flos": 48173517100800.0, + "grad_norm": 1.7827905501534176, + "language_loss": 0.61679059, + "learning_rate": 1.127523212606262e-06, + "loss": 0.63866127, + "num_input_tokens_seen": 117124265, + "step": 5442, + "time_per_iteration": 3.0027995109558105 + }, + { + "auxiliary_loss_clip": 0.0112479, + "auxiliary_loss_mlp": 0.01083976, + "balance_loss_clip": 1.0240413, + "balance_loss_mlp": 1.00381923, + "epoch": 0.6544820537485722, + "flos": 26943165843840.0, + "grad_norm": 1.534955326657925, + "language_loss": 0.73052043, + "learning_rate": 1.1268223358627835e-06, + "loss": 0.75260806, + "num_input_tokens_seen": 117146755, + "step": 5443, + "time_per_iteration": 2.697582960128784 + }, + { + "auxiliary_loss_clip": 0.01136626, + "auxiliary_loss_mlp": 0.01085203, + "balance_loss_clip": 1.02665401, + "balance_loss_mlp": 1.00485611, + "epoch": 0.6546022966392112, + "flos": 20886328748160.0, + "grad_norm": 1.6565073742977225, + "language_loss": 0.718063, + "learning_rate": 1.126121591579663e-06, + "loss": 0.74028134, + "num_input_tokens_seen": 117165960, + "step": 5444, + "time_per_iteration": 2.6440346240997314 + }, + { + "auxiliary_loss_clip": 0.01126646, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_clip": 1.02622938, + "balance_loss_mlp": 1.00342095, + "epoch": 0.6547225395298503, + "flos": 24936693143040.0, + "grad_norm": 1.546044973714826, + "language_loss": 0.69151682, + "learning_rate": 1.1254209798632018e-06, + "loss": 0.7136181, + "num_input_tokens_seen": 117186980, + "step": 5445, + "time_per_iteration": 2.7219417095184326 + }, + { + "auxiliary_loss_clip": 0.01084028, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_clip": 1.02279949, + "balance_loss_mlp": 1.00432765, + "epoch": 0.6548427824204894, + "flos": 22565942663040.0, + "grad_norm": 1.5939522661012229, + "language_loss": 0.84589404, + "learning_rate": 1.124720500819683e-06, + "loss": 0.86757916, + "num_input_tokens_seen": 117205135, + "step": 5446, + "time_per_iteration": 2.8483502864837646 + }, + { + "auxiliary_loss_clip": 0.01135769, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_clip": 1.02594924, + "balance_loss_mlp": 1.00425732, + "epoch": 0.6549630253111285, + "flos": 18442500048000.0, + "grad_norm": 1.9860997163927376, + "language_loss": 0.82128417, + "learning_rate": 1.1240201545553682e-06, + "loss": 0.84348691, + "num_input_tokens_seen": 117222935, + "step": 5447, + "time_per_iteration": 2.6336872577667236 + }, + { + "auxiliary_loss_clip": 0.01107498, + "auxiliary_loss_mlp": 0.01084917, + "balance_loss_clip": 1.02444756, + "balance_loss_mlp": 1.0047133, + "epoch": 0.6550832682017675, + "flos": 25187313312000.0, + "grad_norm": 1.657714214853162, + "language_loss": 0.73115611, + "learning_rate": 1.1233199411764987e-06, + "loss": 0.75308025, + "num_input_tokens_seen": 117242370, + "step": 5448, + "time_per_iteration": 2.7627148628234863 + }, + { + "auxiliary_loss_clip": 0.01109118, + "auxiliary_loss_mlp": 0.01083395, + "balance_loss_clip": 1.02475405, + "balance_loss_mlp": 1.00319135, + "epoch": 0.6552035110924067, + "flos": 22748153379840.0, + "grad_norm": 1.6389239277858054, + "language_loss": 0.6904155, + "learning_rate": 1.1226198607892978e-06, + "loss": 0.71234071, + "num_input_tokens_seen": 117262930, + "step": 5449, + "time_per_iteration": 2.821582317352295 + }, + { + "auxiliary_loss_clip": 0.01099234, + "auxiliary_loss_mlp": 0.01084232, + "balance_loss_clip": 1.02437949, + "balance_loss_mlp": 1.00402808, + "epoch": 0.6553237539830458, + "flos": 21799178012160.0, + "grad_norm": 1.7372006612975037, + "language_loss": 0.79852474, + "learning_rate": 1.1219199134999664e-06, + "loss": 0.82035935, + "num_input_tokens_seen": 117281430, + "step": 5450, + "time_per_iteration": 2.785550832748413 + }, + { + "auxiliary_loss_clip": 0.01095498, + "auxiliary_loss_mlp": 0.01084633, + "balance_loss_clip": 1.02548647, + "balance_loss_mlp": 1.00428605, + "epoch": 0.6554439968736848, + "flos": 20887226588160.0, + "grad_norm": 2.1846879796497696, + "language_loss": 0.78500843, + "learning_rate": 1.1212200994146863e-06, + "loss": 0.80680972, + "num_input_tokens_seen": 117299185, + "step": 5451, + "time_per_iteration": 2.6482155323028564 + }, + { + "auxiliary_loss_clip": 0.01107958, + "auxiliary_loss_mlp": 0.01083654, + "balance_loss_clip": 1.02299309, + "balance_loss_mlp": 1.00354552, + "epoch": 0.655564239764324, + "flos": 16139045698560.0, + "grad_norm": 1.6919158654019455, + "language_loss": 0.75648165, + "learning_rate": 1.120520418639618e-06, + "loss": 0.7783978, + "num_input_tokens_seen": 117317720, + "step": 5452, + "time_per_iteration": 2.726184606552124 + }, + { + "auxiliary_loss_clip": 0.01126614, + "auxiliary_loss_mlp": 0.01084444, + "balance_loss_clip": 1.02613151, + "balance_loss_mlp": 1.00424004, + "epoch": 0.655684482654963, + "flos": 29570354496000.0, + "grad_norm": 1.9953676635394877, + "language_loss": 0.8346256, + "learning_rate": 1.119820871280903e-06, + "loss": 0.85673618, + "num_input_tokens_seen": 117338795, + "step": 5453, + "time_per_iteration": 3.5931384563446045 + }, + { + "auxiliary_loss_clip": 0.01128062, + "auxiliary_loss_mlp": 0.0108417, + "balance_loss_clip": 1.02601194, + "balance_loss_mlp": 1.00387025, + "epoch": 0.6558047255456021, + "flos": 29789409588480.0, + "grad_norm": 1.7707054657077765, + "language_loss": 0.73186469, + "learning_rate": 1.1191214574446614e-06, + "loss": 0.75398701, + "num_input_tokens_seen": 117359040, + "step": 5454, + "time_per_iteration": 2.6861321926116943 + }, + { + "auxiliary_loss_clip": 0.01116719, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_clip": 1.02425742, + "balance_loss_mlp": 1.00429821, + "epoch": 0.6559249684362413, + "flos": 29059166090880.0, + "grad_norm": 1.3885291108865896, + "language_loss": 0.80050576, + "learning_rate": 1.118422177236995e-06, + "loss": 0.82251847, + "num_input_tokens_seen": 117380865, + "step": 5455, + "time_per_iteration": 2.727268695831299 + }, + { + "auxiliary_loss_clip": 0.01116926, + "auxiliary_loss_mlp": 0.01085315, + "balance_loss_clip": 1.02463388, + "balance_loss_mlp": 1.00501573, + "epoch": 0.6560452113268803, + "flos": 20225464369920.0, + "grad_norm": 1.8005083044354897, + "language_loss": 0.85679495, + "learning_rate": 1.1177230307639835e-06, + "loss": 0.87881738, + "num_input_tokens_seen": 117398405, + "step": 5456, + "time_per_iteration": 2.7441458702087402 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01084517, + "balance_loss_clip": 1.02489972, + "balance_loss_mlp": 1.00426567, + "epoch": 0.6561654542175194, + "flos": 25045538330880.0, + "grad_norm": 1.795988337341745, + "language_loss": 0.78463638, + "learning_rate": 1.1170240181316865e-06, + "loss": 0.80656779, + "num_input_tokens_seen": 117419850, + "step": 5457, + "time_per_iteration": 2.8293652534484863 + }, + { + "auxiliary_loss_clip": 0.01107873, + "auxiliary_loss_mlp": 0.01085297, + "balance_loss_clip": 1.02282739, + "balance_loss_mlp": 1.00499761, + "epoch": 0.6562856971081584, + "flos": 22856711258880.0, + "grad_norm": 2.874755530100832, + "language_loss": 0.79421651, + "learning_rate": 1.1163251394461442e-06, + "loss": 0.81614822, + "num_input_tokens_seen": 117438330, + "step": 5458, + "time_per_iteration": 4.684141635894775 + }, + { + "auxiliary_loss_clip": 0.01128475, + "auxiliary_loss_mlp": 0.01083885, + "balance_loss_clip": 1.02627504, + "balance_loss_mlp": 1.00353789, + "epoch": 0.6564059399987976, + "flos": 18872565586560.0, + "grad_norm": 1.943040230782104, + "language_loss": 0.82511586, + "learning_rate": 1.1156263948133746e-06, + "loss": 0.84723943, + "num_input_tokens_seen": 117454985, + "step": 5459, + "time_per_iteration": 2.7404918670654297 + }, + { + "auxiliary_loss_clip": 0.01097339, + "auxiliary_loss_mlp": 0.00872962, + "balance_loss_clip": 1.02242315, + "balance_loss_mlp": 1.00009453, + "epoch": 0.6565261828894366, + "flos": 25484187219840.0, + "grad_norm": 1.65497425705136, + "language_loss": 0.77649683, + "learning_rate": 1.1149277843393787e-06, + "loss": 0.79619986, + "num_input_tokens_seen": 117476145, + "step": 5460, + "time_per_iteration": 3.768475294113159 + }, + { + "auxiliary_loss_clip": 0.01076394, + "auxiliary_loss_mlp": 0.00872853, + "balance_loss_clip": 1.02402902, + "balance_loss_mlp": 1.00009251, + "epoch": 0.6566464257800757, + "flos": 19683500987520.0, + "grad_norm": 1.9134927888526065, + "language_loss": 0.63054985, + "learning_rate": 1.1142293081301342e-06, + "loss": 0.6500423, + "num_input_tokens_seen": 117494025, + "step": 5461, + "time_per_iteration": 2.81537127494812 + }, + { + "auxiliary_loss_clip": 0.01117875, + "auxiliary_loss_mlp": 0.01084857, + "balance_loss_clip": 1.02512085, + "balance_loss_mlp": 1.00479579, + "epoch": 0.6567666686707149, + "flos": 23514127931520.0, + "grad_norm": 1.4999569147028056, + "language_loss": 0.67790699, + "learning_rate": 1.1135309662915995e-06, + "loss": 0.6999343, + "num_input_tokens_seen": 117514190, + "step": 5462, + "time_per_iteration": 2.748692750930786 + }, + { + "auxiliary_loss_clip": 0.01082578, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_clip": 1.02322412, + "balance_loss_mlp": 1.00412142, + "epoch": 0.6568869115613539, + "flos": 32781342896640.0, + "grad_norm": 1.856336851663268, + "language_loss": 0.60307181, + "learning_rate": 1.112832758929712e-06, + "loss": 0.62474179, + "num_input_tokens_seen": 117536800, + "step": 5463, + "time_per_iteration": 2.8329834938049316 + }, + { + "auxiliary_loss_clip": 0.01119486, + "auxiliary_loss_mlp": 0.01084091, + "balance_loss_clip": 1.02439785, + "balance_loss_mlp": 1.00383914, + "epoch": 0.657007154451993, + "flos": 18442428220800.0, + "grad_norm": 1.658350424241677, + "language_loss": 0.74917209, + "learning_rate": 1.11213468615039e-06, + "loss": 0.77120781, + "num_input_tokens_seen": 117556230, + "step": 5464, + "time_per_iteration": 2.691575288772583 + }, + { + "auxiliary_loss_clip": 0.01074158, + "auxiliary_loss_mlp": 0.01084442, + "balance_loss_clip": 1.02362275, + "balance_loss_mlp": 1.00433302, + "epoch": 0.6571273973426321, + "flos": 25156717902720.0, + "grad_norm": 1.5641324388372466, + "language_loss": 0.75303167, + "learning_rate": 1.1114367480595292e-06, + "loss": 0.77461767, + "num_input_tokens_seen": 117577310, + "step": 5465, + "time_per_iteration": 2.8394293785095215 + }, + { + "auxiliary_loss_clip": 0.01081735, + "auxiliary_loss_mlp": 0.01085576, + "balance_loss_clip": 1.01850832, + "balance_loss_mlp": 1.00518143, + "epoch": 0.6572476402332712, + "flos": 17529830352000.0, + "grad_norm": 1.8760792175368708, + "language_loss": 0.81384063, + "learning_rate": 1.1107389447630086e-06, + "loss": 0.83551365, + "num_input_tokens_seen": 117596010, + "step": 5466, + "time_per_iteration": 2.827500581741333 + }, + { + "auxiliary_loss_clip": 0.0111856, + "auxiliary_loss_mlp": 0.00872847, + "balance_loss_clip": 1.02500832, + "balance_loss_mlp": 1.00013471, + "epoch": 0.6573678831239103, + "flos": 17014260487680.0, + "grad_norm": 2.222151017262253, + "language_loss": 0.78099036, + "learning_rate": 1.1100412763666818e-06, + "loss": 0.80090439, + "num_input_tokens_seen": 117611270, + "step": 5467, + "time_per_iteration": 2.66969895362854 + }, + { + "auxiliary_loss_clip": 0.01116815, + "auxiliary_loss_mlp": 0.01084781, + "balance_loss_clip": 1.02445877, + "balance_loss_mlp": 1.00462437, + "epoch": 0.6574881260145494, + "flos": 23910078528000.0, + "grad_norm": 1.4243076908085095, + "language_loss": 0.79715949, + "learning_rate": 1.1093437429763865e-06, + "loss": 0.81917548, + "num_input_tokens_seen": 117631535, + "step": 5468, + "time_per_iteration": 2.739025831222534 + }, + { + "auxiliary_loss_clip": 0.01127315, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_clip": 1.02615511, + "balance_loss_mlp": 1.00393009, + "epoch": 0.6576083689051885, + "flos": 11218458504960.0, + "grad_norm": 1.941028013948078, + "language_loss": 0.73323894, + "learning_rate": 1.1086463446979361e-06, + "loss": 0.7553525, + "num_input_tokens_seen": 117649885, + "step": 5469, + "time_per_iteration": 2.6802473068237305 + }, + { + "auxiliary_loss_clip": 0.01125922, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_clip": 1.02522898, + "balance_loss_mlp": 1.00437498, + "epoch": 0.6577286117958275, + "flos": 22455553190400.0, + "grad_norm": 2.1048707725582676, + "language_loss": 0.7705934, + "learning_rate": 1.1079490816371277e-06, + "loss": 0.79269844, + "num_input_tokens_seen": 117669650, + "step": 5470, + "time_per_iteration": 2.665409564971924 + }, + { + "auxiliary_loss_clip": 0.01124344, + "auxiliary_loss_mlp": 0.00872934, + "balance_loss_clip": 1.02381957, + "balance_loss_mlp": 1.00006795, + "epoch": 0.6578488546864667, + "flos": 21872184405120.0, + "grad_norm": 1.9504080836150344, + "language_loss": 0.7473352, + "learning_rate": 1.1072519538997352e-06, + "loss": 0.76730794, + "num_input_tokens_seen": 117688790, + "step": 5471, + "time_per_iteration": 2.708923816680908 + }, + { + "auxiliary_loss_clip": 0.01116281, + "auxiliary_loss_mlp": 0.01083793, + "balance_loss_clip": 1.02377617, + "balance_loss_mlp": 1.00344634, + "epoch": 0.6579690975771058, + "flos": 23543753673600.0, + "grad_norm": 1.6487672173942953, + "language_loss": 0.82565808, + "learning_rate": 1.1065549615915095e-06, + "loss": 0.84765887, + "num_input_tokens_seen": 117708620, + "step": 5472, + "time_per_iteration": 2.697471857070923 + }, + { + "auxiliary_loss_clip": 0.01127493, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_clip": 1.026932, + "balance_loss_mlp": 1.00482082, + "epoch": 0.6580893404677448, + "flos": 32743995730560.0, + "grad_norm": 2.364090097480586, + "language_loss": 0.78224301, + "learning_rate": 1.105858104818187e-06, + "loss": 0.80436873, + "num_input_tokens_seen": 117729775, + "step": 5473, + "time_per_iteration": 2.745044708251953 + }, + { + "auxiliary_loss_clip": 0.01127908, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_clip": 1.02656841, + "balance_loss_mlp": 1.00419927, + "epoch": 0.658209583358384, + "flos": 15888138220800.0, + "grad_norm": 3.295221128325905, + "language_loss": 0.74507475, + "learning_rate": 1.105161383685478e-06, + "loss": 0.76719975, + "num_input_tokens_seen": 117746160, + "step": 5474, + "time_per_iteration": 2.6551668643951416 + }, + { + "auxiliary_loss_clip": 0.01090376, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_clip": 1.02040505, + "balance_loss_mlp": 1.00040674, + "epoch": 0.658329826249023, + "flos": 62695902447360.0, + "grad_norm": 0.8623401064559421, + "language_loss": 0.5635891, + "learning_rate": 1.1044647982990771e-06, + "loss": 0.58528662, + "num_input_tokens_seen": 117808045, + "step": 5475, + "time_per_iteration": 3.255718946456909 + }, + { + "auxiliary_loss_clip": 0.01116143, + "auxiliary_loss_mlp": 0.01084816, + "balance_loss_clip": 1.02513838, + "balance_loss_mlp": 1.00451636, + "epoch": 0.6584500691396621, + "flos": 31722624501120.0, + "grad_norm": 2.12183907698765, + "language_loss": 0.64769959, + "learning_rate": 1.1037683487646536e-06, + "loss": 0.66970909, + "num_input_tokens_seen": 117828330, + "step": 5476, + "time_per_iteration": 2.787167549133301 + }, + { + "auxiliary_loss_clip": 0.0111612, + "auxiliary_loss_mlp": 0.00872849, + "balance_loss_clip": 1.02524161, + "balance_loss_mlp": 1.00011373, + "epoch": 0.6585703120303013, + "flos": 18406086635520.0, + "grad_norm": 2.7072316993932346, + "language_loss": 0.76846027, + "learning_rate": 1.1030720351878583e-06, + "loss": 0.78834999, + "num_input_tokens_seen": 117846450, + "step": 5477, + "time_per_iteration": 2.6634361743927 + }, + { + "auxiliary_loss_clip": 0.0110029, + "auxiliary_loss_mlp": 0.01079172, + "balance_loss_clip": 1.02090788, + "balance_loss_mlp": 1.00020778, + "epoch": 0.6586905549209403, + "flos": 58309880434560.0, + "grad_norm": 0.8070793410593887, + "language_loss": 0.57620251, + "learning_rate": 1.102375857674323e-06, + "loss": 0.59799713, + "num_input_tokens_seen": 117908365, + "step": 5478, + "time_per_iteration": 3.2317159175872803 + }, + { + "auxiliary_loss_clip": 0.01119398, + "auxiliary_loss_mlp": 0.01084255, + "balance_loss_clip": 1.02532578, + "balance_loss_mlp": 1.00405085, + "epoch": 0.6588107978115794, + "flos": 22782627457920.0, + "grad_norm": 1.7245562835727677, + "language_loss": 0.9026736, + "learning_rate": 1.1016798163296561e-06, + "loss": 0.92471015, + "num_input_tokens_seen": 117927565, + "step": 5479, + "time_per_iteration": 3.5453171730041504 + }, + { + "auxiliary_loss_clip": 0.01110505, + "auxiliary_loss_mlp": 0.01084944, + "balance_loss_clip": 1.02619267, + "balance_loss_mlp": 1.00469196, + "epoch": 0.6589310407022185, + "flos": 20667525050880.0, + "grad_norm": 1.8372831833778962, + "language_loss": 0.66181403, + "learning_rate": 1.1009839112594471e-06, + "loss": 0.68376851, + "num_input_tokens_seen": 117945590, + "step": 5480, + "time_per_iteration": 2.63757586479187 + }, + { + "auxiliary_loss_clip": 0.01126742, + "auxiliary_loss_mlp": 0.01084448, + "balance_loss_clip": 1.02531135, + "balance_loss_mlp": 1.00419629, + "epoch": 0.6590512835928576, + "flos": 25630595055360.0, + "grad_norm": 2.1069352126679166, + "language_loss": 0.72300816, + "learning_rate": 1.1002881425692638e-06, + "loss": 0.74512011, + "num_input_tokens_seen": 117966020, + "step": 5481, + "time_per_iteration": 2.761704444885254 + }, + { + "auxiliary_loss_clip": 0.01127321, + "auxiliary_loss_mlp": 0.01084352, + "balance_loss_clip": 1.02549398, + "balance_loss_mlp": 1.0041008, + "epoch": 0.6591715264834966, + "flos": 23726108044800.0, + "grad_norm": 1.6408701624086501, + "language_loss": 0.7550149, + "learning_rate": 1.0995925103646532e-06, + "loss": 0.77713162, + "num_input_tokens_seen": 117984620, + "step": 5482, + "time_per_iteration": 2.6684486865997314 + }, + { + "auxiliary_loss_clip": 0.01108622, + "auxiliary_loss_mlp": 0.01084772, + "balance_loss_clip": 1.02518666, + "balance_loss_mlp": 1.00456774, + "epoch": 0.6592917693741358, + "flos": 35773850822400.0, + "grad_norm": 1.5666359946059258, + "language_loss": 0.665663, + "learning_rate": 1.0988970147511437e-06, + "loss": 0.68759692, + "num_input_tokens_seen": 118006500, + "step": 5483, + "time_per_iteration": 3.7357115745544434 + }, + { + "auxiliary_loss_clip": 0.01116902, + "auxiliary_loss_mlp": 0.01083607, + "balance_loss_clip": 1.02560866, + "balance_loss_mlp": 1.00326037, + "epoch": 0.6594120122647749, + "flos": 21396834794880.0, + "grad_norm": 1.9660609416271828, + "language_loss": 0.80779344, + "learning_rate": 1.0982016558342405e-06, + "loss": 0.82979852, + "num_input_tokens_seen": 118025470, + "step": 5484, + "time_per_iteration": 3.6638622283935547 + }, + { + "auxiliary_loss_clip": 0.01136153, + "auxiliary_loss_mlp": 0.01085095, + "balance_loss_clip": 1.02640438, + "balance_loss_mlp": 1.00484347, + "epoch": 0.6595322551554139, + "flos": 19351829779200.0, + "grad_norm": 1.6981227436815047, + "language_loss": 0.70874202, + "learning_rate": 1.0975064337194291e-06, + "loss": 0.73095453, + "num_input_tokens_seen": 118043515, + "step": 5485, + "time_per_iteration": 3.579758644104004 + }, + { + "auxiliary_loss_clip": 0.01104195, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_clip": 1.02135301, + "balance_loss_mlp": 1.00441265, + "epoch": 0.6596524980460531, + "flos": 16837113588480.0, + "grad_norm": 1.4839842835714057, + "language_loss": 0.70496505, + "learning_rate": 1.0968113485121743e-06, + "loss": 0.72685361, + "num_input_tokens_seen": 118063105, + "step": 5486, + "time_per_iteration": 2.812177896499634 + }, + { + "auxiliary_loss_clip": 0.01127033, + "auxiliary_loss_mlp": 0.0087295, + "balance_loss_clip": 1.02562571, + "balance_loss_mlp": 1.00008762, + "epoch": 0.6597727409366921, + "flos": 21798567480960.0, + "grad_norm": 1.761875701162192, + "language_loss": 0.80070293, + "learning_rate": 1.0961164003179185e-06, + "loss": 0.82070279, + "num_input_tokens_seen": 118081615, + "step": 5487, + "time_per_iteration": 2.610281467437744 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.0108451, + "balance_loss_clip": 1.02396166, + "balance_loss_mlp": 1.00425851, + "epoch": 0.6598929838273312, + "flos": 23730704985600.0, + "grad_norm": 2.023731086846583, + "language_loss": 0.84120297, + "learning_rate": 1.0954215892420884e-06, + "loss": 0.86313438, + "num_input_tokens_seen": 118102315, + "step": 5488, + "time_per_iteration": 2.8129281997680664 + }, + { + "auxiliary_loss_clip": 0.01107161, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_clip": 1.02362132, + "balance_loss_mlp": 1.00435376, + "epoch": 0.6600132267179702, + "flos": 19974520978560.0, + "grad_norm": 1.6066702934176202, + "language_loss": 0.70516539, + "learning_rate": 1.094726915390082e-06, + "loss": 0.72708309, + "num_input_tokens_seen": 118120650, + "step": 5489, + "time_per_iteration": 2.7200355529785156 + }, + { + "auxiliary_loss_clip": 0.01126071, + "auxiliary_loss_mlp": 0.01083846, + "balance_loss_clip": 1.02574766, + "balance_loss_mlp": 1.00364184, + "epoch": 0.6601334696086094, + "flos": 22342649765760.0, + "grad_norm": 1.7108423055158157, + "language_loss": 0.69482034, + "learning_rate": 1.0940323788672836e-06, + "loss": 0.71691948, + "num_input_tokens_seen": 118139825, + "step": 5490, + "time_per_iteration": 2.718435049057007 + }, + { + "auxiliary_loss_clip": 0.01124324, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_clip": 1.02500987, + "balance_loss_mlp": 1.00421739, + "epoch": 0.6602537124992485, + "flos": 25703098657920.0, + "grad_norm": 1.67431858157534, + "language_loss": 0.73758459, + "learning_rate": 1.0933379797790522e-06, + "loss": 0.75967211, + "num_input_tokens_seen": 118159240, + "step": 5491, + "time_per_iteration": 2.689927339553833 + }, + { + "auxiliary_loss_clip": 0.01136644, + "auxiliary_loss_mlp": 0.01085456, + "balance_loss_clip": 1.02695632, + "balance_loss_mlp": 1.00520456, + "epoch": 0.6603739553898875, + "flos": 25848572739840.0, + "grad_norm": 2.4594424227409744, + "language_loss": 0.71109343, + "learning_rate": 1.0926437182307293e-06, + "loss": 0.73331451, + "num_input_tokens_seen": 118178050, + "step": 5492, + "time_per_iteration": 2.6879611015319824 + }, + { + "auxiliary_loss_clip": 0.01117503, + "auxiliary_loss_mlp": 0.01083556, + "balance_loss_clip": 1.02436841, + "balance_loss_mlp": 1.003304, + "epoch": 0.6604941982805267, + "flos": 24570296461440.0, + "grad_norm": 1.9055644518261459, + "language_loss": 0.7791431, + "learning_rate": 1.0919495943276338e-06, + "loss": 0.80115366, + "num_input_tokens_seen": 118199070, + "step": 5493, + "time_per_iteration": 2.7206668853759766 + }, + { + "auxiliary_loss_clip": 0.01111827, + "auxiliary_loss_mlp": 0.01084432, + "balance_loss_clip": 1.02515256, + "balance_loss_mlp": 1.00413311, + "epoch": 0.6606144411711657, + "flos": 13261775581440.0, + "grad_norm": 2.0525379151696765, + "language_loss": 0.75634253, + "learning_rate": 1.0912556081750611e-06, + "loss": 0.77830517, + "num_input_tokens_seen": 118217000, + "step": 5494, + "time_per_iteration": 2.740715742111206 + }, + { + "auxiliary_loss_clip": 0.01114728, + "auxiliary_loss_mlp": 0.01084169, + "balance_loss_clip": 1.02401626, + "balance_loss_mlp": 1.00401258, + "epoch": 0.6607346840618048, + "flos": 25155281358720.0, + "grad_norm": 1.7155652397891903, + "language_loss": 0.75956768, + "learning_rate": 1.0905617598782909e-06, + "loss": 0.78155661, + "num_input_tokens_seen": 118237205, + "step": 5495, + "time_per_iteration": 2.7374157905578613 + }, + { + "auxiliary_loss_clip": 0.01099034, + "auxiliary_loss_mlp": 0.01084076, + "balance_loss_clip": 1.02416396, + "balance_loss_mlp": 1.00391984, + "epoch": 0.660854926952444, + "flos": 17638029095040.0, + "grad_norm": 1.972952697091814, + "language_loss": 0.81259894, + "learning_rate": 1.0898680495425775e-06, + "loss": 0.83442998, + "num_input_tokens_seen": 118255495, + "step": 5496, + "time_per_iteration": 2.764357805252075 + }, + { + "auxiliary_loss_clip": 0.01117832, + "auxiliary_loss_mlp": 0.01085785, + "balance_loss_clip": 1.02525282, + "balance_loss_mlp": 1.00558078, + "epoch": 0.660975169843083, + "flos": 16836000266880.0, + "grad_norm": 1.6662368226742912, + "language_loss": 0.80166316, + "learning_rate": 1.0891744772731594e-06, + "loss": 0.82369936, + "num_input_tokens_seen": 118273310, + "step": 5497, + "time_per_iteration": 2.686145067214966 + }, + { + "auxiliary_loss_clip": 0.01127085, + "auxiliary_loss_mlp": 0.01084216, + "balance_loss_clip": 1.02555728, + "balance_loss_mlp": 1.00405931, + "epoch": 0.6610954127337221, + "flos": 26870410846080.0, + "grad_norm": 1.71290583714228, + "language_loss": 0.65959579, + "learning_rate": 1.088481043175248e-06, + "loss": 0.68170881, + "num_input_tokens_seen": 118293880, + "step": 5498, + "time_per_iteration": 2.717851161956787 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01084249, + "balance_loss_clip": 1.02542675, + "balance_loss_mlp": 1.00404501, + "epoch": 0.6612156556243612, + "flos": 26465697331200.0, + "grad_norm": 1.7866261988185859, + "language_loss": 0.7574842, + "learning_rate": 1.0877877473540368e-06, + "loss": 0.77951401, + "num_input_tokens_seen": 118314465, + "step": 5499, + "time_per_iteration": 2.7617549896240234 + }, + { + "auxiliary_loss_clip": 0.01136094, + "auxiliary_loss_mlp": 0.01084807, + "balance_loss_clip": 1.02587199, + "balance_loss_mlp": 1.00460279, + "epoch": 0.6613358985150003, + "flos": 19791915212160.0, + "grad_norm": 1.730862500441942, + "language_loss": 0.72609866, + "learning_rate": 1.0870945899147002e-06, + "loss": 0.74830765, + "num_input_tokens_seen": 118331110, + "step": 5500, + "time_per_iteration": 2.6702613830566406 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01083846, + "balance_loss_clip": 1.02508867, + "balance_loss_mlp": 1.00368941, + "epoch": 0.6614561414056394, + "flos": 26831627136000.0, + "grad_norm": 1.9255134903935025, + "language_loss": 0.76502472, + "learning_rate": 1.0864015709623879e-06, + "loss": 0.78711736, + "num_input_tokens_seen": 118351980, + "step": 5501, + "time_per_iteration": 2.746572971343994 + }, + { + "auxiliary_loss_clip": 0.01126803, + "auxiliary_loss_mlp": 0.01084687, + "balance_loss_clip": 1.02488363, + "balance_loss_mlp": 1.00448251, + "epoch": 0.6615763842962785, + "flos": 22894597128960.0, + "grad_norm": 2.237431031824167, + "language_loss": 0.80268991, + "learning_rate": 1.0857086906022313e-06, + "loss": 0.82480478, + "num_input_tokens_seen": 118370315, + "step": 5502, + "time_per_iteration": 2.7140848636627197 + }, + { + "auxiliary_loss_clip": 0.01082557, + "auxiliary_loss_mlp": 0.01083778, + "balance_loss_clip": 1.01895952, + "balance_loss_mlp": 1.00362206, + "epoch": 0.6616966271869176, + "flos": 24790321221120.0, + "grad_norm": 1.974437411317424, + "language_loss": 0.73128802, + "learning_rate": 1.0850159489393388e-06, + "loss": 0.75295138, + "num_input_tokens_seen": 118389575, + "step": 5503, + "time_per_iteration": 2.9279961585998535 + }, + { + "auxiliary_loss_clip": 0.01107425, + "auxiliary_loss_mlp": 0.01083423, + "balance_loss_clip": 1.02261484, + "balance_loss_mlp": 1.0033139, + "epoch": 0.6618168700775566, + "flos": 17202109639680.0, + "grad_norm": 2.151267641658424, + "language_loss": 0.82422864, + "learning_rate": 1.0843233460787992e-06, + "loss": 0.84613705, + "num_input_tokens_seen": 118406790, + "step": 5504, + "time_per_iteration": 2.7444427013397217 + }, + { + "auxiliary_loss_clip": 0.01102757, + "auxiliary_loss_mlp": 0.01084387, + "balance_loss_clip": 1.02115405, + "balance_loss_mlp": 1.0042305, + "epoch": 0.6619371129681958, + "flos": 25447091448960.0, + "grad_norm": 3.4919450364479903, + "language_loss": 0.78183407, + "learning_rate": 1.0836308821256805e-06, + "loss": 0.80370551, + "num_input_tokens_seen": 118427590, + "step": 5505, + "time_per_iteration": 3.7302138805389404 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01084211, + "balance_loss_clip": 1.02391529, + "balance_loss_mlp": 1.00410199, + "epoch": 0.6620573558588349, + "flos": 18040444139520.0, + "grad_norm": 1.9264594377469342, + "language_loss": 0.78013766, + "learning_rate": 1.0829385571850282e-06, + "loss": 0.80221683, + "num_input_tokens_seen": 118444570, + "step": 5506, + "time_per_iteration": 2.699036121368408 + }, + { + "auxiliary_loss_clip": 0.0113592, + "auxiliary_loss_mlp": 0.01087529, + "balance_loss_clip": 1.0256176, + "balance_loss_mlp": 1.00718212, + "epoch": 0.6621775987494739, + "flos": 17785586165760.0, + "grad_norm": 2.6180393577958854, + "language_loss": 0.83733398, + "learning_rate": 1.0822463713618679e-06, + "loss": 0.85956848, + "num_input_tokens_seen": 118461425, + "step": 5507, + "time_per_iteration": 2.5756969451904297 + }, + { + "auxiliary_loss_clip": 0.0110614, + "auxiliary_loss_mlp": 0.01084785, + "balance_loss_clip": 1.02314019, + "balance_loss_mlp": 1.00462878, + "epoch": 0.6622978416401131, + "flos": 17492590926720.0, + "grad_norm": 2.135679247904913, + "language_loss": 0.84805423, + "learning_rate": 1.0815543247612034e-06, + "loss": 0.86996353, + "num_input_tokens_seen": 118478495, + "step": 5508, + "time_per_iteration": 3.7385923862457275 + }, + { + "auxiliary_loss_clip": 0.01117882, + "auxiliary_loss_mlp": 0.01083731, + "balance_loss_clip": 1.02433825, + "balance_loss_mlp": 1.0035274, + "epoch": 0.6624180845307521, + "flos": 21648352803840.0, + "grad_norm": 1.5145335593948228, + "language_loss": 0.83077002, + "learning_rate": 1.0808624174880168e-06, + "loss": 0.85278612, + "num_input_tokens_seen": 118499145, + "step": 5509, + "time_per_iteration": 3.6231260299682617 + }, + { + "auxiliary_loss_clip": 0.01136881, + "auxiliary_loss_mlp": 0.01083838, + "balance_loss_clip": 1.02741051, + "balance_loss_mlp": 1.00368214, + "epoch": 0.6625383274213912, + "flos": 23805902108160.0, + "grad_norm": 1.6749987489000755, + "language_loss": 0.79730135, + "learning_rate": 1.080170649647272e-06, + "loss": 0.81950855, + "num_input_tokens_seen": 118518950, + "step": 5510, + "time_per_iteration": 3.61462664604187 + }, + { + "auxiliary_loss_clip": 0.01135162, + "auxiliary_loss_mlp": 0.01083962, + "balance_loss_clip": 1.02593589, + "balance_loss_mlp": 1.00380599, + "epoch": 0.6626585703120303, + "flos": 33262941473280.0, + "grad_norm": 2.849445078068884, + "language_loss": 0.67295444, + "learning_rate": 1.0794790213439068e-06, + "loss": 0.69514567, + "num_input_tokens_seen": 118545850, + "step": 5511, + "time_per_iteration": 2.7648253440856934 + }, + { + "auxiliary_loss_clip": 0.01098801, + "auxiliary_loss_mlp": 0.01085098, + "balance_loss_clip": 1.02315545, + "balance_loss_mlp": 1.00479877, + "epoch": 0.6627788132026694, + "flos": 22085780630400.0, + "grad_norm": 2.0786987267326458, + "language_loss": 0.78640342, + "learning_rate": 1.078787532682843e-06, + "loss": 0.80824232, + "num_input_tokens_seen": 118563325, + "step": 5512, + "time_per_iteration": 2.80574107170105 + }, + { + "auxiliary_loss_clip": 0.0112577, + "auxiliary_loss_mlp": 0.01084189, + "balance_loss_clip": 1.02532029, + "balance_loss_mlp": 1.00393701, + "epoch": 0.6628990560933085, + "flos": 36173608260480.0, + "grad_norm": 3.7901091840744936, + "language_loss": 0.75623095, + "learning_rate": 1.0780961837689773e-06, + "loss": 0.7783305, + "num_input_tokens_seen": 118582835, + "step": 5513, + "time_per_iteration": 2.8245108127593994 + }, + { + "auxiliary_loss_clip": 0.0111719, + "auxiliary_loss_mlp": 0.01084473, + "balance_loss_clip": 1.02560711, + "balance_loss_mlp": 1.00431681, + "epoch": 0.6630192989839476, + "flos": 18513567106560.0, + "grad_norm": 1.5944290453699659, + "language_loss": 0.69962645, + "learning_rate": 1.0774049747071883e-06, + "loss": 0.72164303, + "num_input_tokens_seen": 118600715, + "step": 5514, + "time_per_iteration": 2.721684694290161 + }, + { + "auxiliary_loss_clip": 0.01097617, + "auxiliary_loss_mlp": 0.01084661, + "balance_loss_clip": 1.0243361, + "balance_loss_mlp": 1.00445747, + "epoch": 0.6631395418745867, + "flos": 35809510049280.0, + "grad_norm": 11.724952551717475, + "language_loss": 0.68260497, + "learning_rate": 1.076713905602332e-06, + "loss": 0.70442778, + "num_input_tokens_seen": 118621290, + "step": 5515, + "time_per_iteration": 2.8854269981384277 + }, + { + "auxiliary_loss_clip": 0.0112769, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_clip": 1.0263052, + "balance_loss_mlp": 1.00409627, + "epoch": 0.6632597847652257, + "flos": 20047742853120.0, + "grad_norm": 1.6473444272076814, + "language_loss": 0.8101958, + "learning_rate": 1.07602297655924e-06, + "loss": 0.83231568, + "num_input_tokens_seen": 118639610, + "step": 5516, + "time_per_iteration": 2.6934471130371094 + }, + { + "auxiliary_loss_clip": 0.0113692, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_clip": 1.02719712, + "balance_loss_mlp": 1.00425208, + "epoch": 0.6633800276558649, + "flos": 21214480423680.0, + "grad_norm": 1.7233500371548498, + "language_loss": 0.80732739, + "learning_rate": 1.0753321876827292e-06, + "loss": 0.82954109, + "num_input_tokens_seen": 118658895, + "step": 5517, + "time_per_iteration": 2.6470346450805664 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01084978, + "balance_loss_clip": 1.02587247, + "balance_loss_mlp": 1.00472593, + "epoch": 0.663500270546504, + "flos": 23987753688960.0, + "grad_norm": 1.9645014841794015, + "language_loss": 0.7383849, + "learning_rate": 1.0746415390775893e-06, + "loss": 0.76058769, + "num_input_tokens_seen": 118677025, + "step": 5518, + "time_per_iteration": 2.6622116565704346 + }, + { + "auxiliary_loss_clip": 0.01136205, + "auxiliary_loss_mlp": 0.01084662, + "balance_loss_clip": 1.02702963, + "balance_loss_mlp": 1.00460136, + "epoch": 0.663620513437143, + "flos": 17932389050880.0, + "grad_norm": 1.744358422855304, + "language_loss": 0.75924039, + "learning_rate": 1.0739510308485939e-06, + "loss": 0.78144908, + "num_input_tokens_seen": 118694240, + "step": 5519, + "time_per_iteration": 2.54961895942688 + }, + { + "auxiliary_loss_clip": 0.01076198, + "auxiliary_loss_mlp": 0.01079043, + "balance_loss_clip": 1.02157021, + "balance_loss_mlp": 1.0000788, + "epoch": 0.6637407563277821, + "flos": 57840241086720.0, + "grad_norm": 0.8199669278079361, + "language_loss": 0.62560654, + "learning_rate": 1.07326066310049e-06, + "loss": 0.64715898, + "num_input_tokens_seen": 118758365, + "step": 5520, + "time_per_iteration": 3.3449254035949707 + }, + { + "auxiliary_loss_clip": 0.01109455, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_clip": 1.02528572, + "balance_loss_mlp": 1.00497353, + "epoch": 0.6638609992184212, + "flos": 27306007079040.0, + "grad_norm": 1.7494006809989866, + "language_loss": 0.79103041, + "learning_rate": 1.0725704359380059e-06, + "loss": 0.81297719, + "num_input_tokens_seen": 118778220, + "step": 5521, + "time_per_iteration": 2.8263230323791504 + }, + { + "auxiliary_loss_clip": 0.01135569, + "auxiliary_loss_mlp": 0.01085511, + "balance_loss_clip": 1.02578926, + "balance_loss_mlp": 1.00521159, + "epoch": 0.6639812421090603, + "flos": 18624854419200.0, + "grad_norm": 1.7326914924193155, + "language_loss": 0.72227502, + "learning_rate": 1.0718803494658497e-06, + "loss": 0.7444858, + "num_input_tokens_seen": 118797110, + "step": 5522, + "time_per_iteration": 2.60408091545105 + }, + { + "auxiliary_loss_clip": 0.01050877, + "auxiliary_loss_mlp": 0.01083914, + "balance_loss_clip": 1.01900828, + "balance_loss_mlp": 1.0037576, + "epoch": 0.6641014849996993, + "flos": 15924479806080.0, + "grad_norm": 2.187374183687621, + "language_loss": 0.83896244, + "learning_rate": 1.071190403788707e-06, + "loss": 0.86031038, + "num_input_tokens_seen": 118812415, + "step": 5523, + "time_per_iteration": 3.017385482788086 + }, + { + "auxiliary_loss_clip": 0.01107936, + "auxiliary_loss_mlp": 0.01083922, + "balance_loss_clip": 1.02380657, + "balance_loss_mlp": 1.00366998, + "epoch": 0.6642217278903385, + "flos": 26505486622080.0, + "grad_norm": 4.570444517459865, + "language_loss": 0.75424832, + "learning_rate": 1.0705005990112415e-06, + "loss": 0.77616692, + "num_input_tokens_seen": 118832195, + "step": 5524, + "time_per_iteration": 3.0145070552825928 + }, + { + "auxiliary_loss_clip": 0.0109077, + "auxiliary_loss_mlp": 0.01083794, + "balance_loss_clip": 1.02325654, + "balance_loss_mlp": 1.00368524, + "epoch": 0.6643419707809776, + "flos": 15377308951680.0, + "grad_norm": 2.33663039535034, + "language_loss": 0.74181592, + "learning_rate": 1.0698109352380957e-06, + "loss": 0.76356155, + "num_input_tokens_seen": 118849795, + "step": 5525, + "time_per_iteration": 2.8280081748962402 + }, + { + "auxiliary_loss_clip": 0.01135391, + "auxiliary_loss_mlp": 0.01084307, + "balance_loss_clip": 1.0256691, + "balance_loss_mlp": 1.00410271, + "epoch": 0.6644622136716166, + "flos": 25117610970240.0, + "grad_norm": 1.919246085328405, + "language_loss": 0.77890658, + "learning_rate": 1.0691214125738909e-06, + "loss": 0.80110359, + "num_input_tokens_seen": 118870000, + "step": 5526, + "time_per_iteration": 2.665972948074341 + }, + { + "auxiliary_loss_clip": 0.01117178, + "auxiliary_loss_mlp": 0.01078844, + "balance_loss_clip": 1.02163363, + "balance_loss_mlp": 0.9998793, + "epoch": 0.6645824565622558, + "flos": 66201717680640.0, + "grad_norm": 0.7850092972459408, + "language_loss": 0.57543254, + "learning_rate": 1.0684320311232287e-06, + "loss": 0.5973928, + "num_input_tokens_seen": 118932905, + "step": 5527, + "time_per_iteration": 3.2942545413970947 + }, + { + "auxiliary_loss_clip": 0.01117705, + "auxiliary_loss_mlp": 0.0108571, + "balance_loss_clip": 1.0247817, + "balance_loss_mlp": 1.00545812, + "epoch": 0.6647026994528948, + "flos": 25082131311360.0, + "grad_norm": 1.7479915976354985, + "language_loss": 0.81324983, + "learning_rate": 1.0677427909906865e-06, + "loss": 0.83528399, + "num_input_tokens_seen": 118953355, + "step": 5528, + "time_per_iteration": 2.773014545440674 + }, + { + "auxiliary_loss_clip": 0.01135969, + "auxiliary_loss_mlp": 0.01084925, + "balance_loss_clip": 1.02576399, + "balance_loss_mlp": 1.00462532, + "epoch": 0.6648229423435339, + "flos": 18222187979520.0, + "grad_norm": 1.774763891943684, + "language_loss": 0.72354662, + "learning_rate": 1.0670536922808216e-06, + "loss": 0.74575555, + "num_input_tokens_seen": 118973480, + "step": 5529, + "time_per_iteration": 2.588122844696045 + }, + { + "auxiliary_loss_clip": 0.0111749, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_clip": 1.0251776, + "balance_loss_mlp": 1.00440061, + "epoch": 0.6649431852341731, + "flos": 18296882311680.0, + "grad_norm": 2.180262487088244, + "language_loss": 0.72215176, + "learning_rate": 1.06636473509817e-06, + "loss": 0.74417269, + "num_input_tokens_seen": 118989860, + "step": 5530, + "time_per_iteration": 3.708728313446045 + }, + { + "auxiliary_loss_clip": 0.01116893, + "auxiliary_loss_mlp": 0.0087284, + "balance_loss_clip": 1.02473807, + "balance_loss_mlp": 1.00011587, + "epoch": 0.6650634281248121, + "flos": 17019575700480.0, + "grad_norm": 1.9035244872276738, + "language_loss": 0.80619717, + "learning_rate": 1.0656759195472447e-06, + "loss": 0.82609445, + "num_input_tokens_seen": 119007150, + "step": 5531, + "time_per_iteration": 2.6654927730560303 + }, + { + "auxiliary_loss_clip": 0.0110165, + "auxiliary_loss_mlp": 0.01079852, + "balance_loss_clip": 1.02297819, + "balance_loss_mlp": 1.00088775, + "epoch": 0.6651836710154512, + "flos": 69294810666240.0, + "grad_norm": 0.7723110751132082, + "language_loss": 0.59764707, + "learning_rate": 1.0649872457325414e-06, + "loss": 0.61946207, + "num_input_tokens_seen": 119068435, + "step": 5532, + "time_per_iteration": 3.184044361114502 + }, + { + "auxiliary_loss_clip": 0.01109483, + "auxiliary_loss_mlp": 0.01078975, + "balance_loss_clip": 1.02190638, + "balance_loss_mlp": 1.00001073, + "epoch": 0.6653039139060903, + "flos": 66883444882560.0, + "grad_norm": 0.8519287295788622, + "language_loss": 0.55220765, + "learning_rate": 1.0642987137585278e-06, + "loss": 0.57409221, + "num_input_tokens_seen": 119127960, + "step": 5533, + "time_per_iteration": 3.1492440700531006 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01084847, + "balance_loss_clip": 1.02517581, + "balance_loss_mlp": 1.00464296, + "epoch": 0.6654241567967294, + "flos": 21470056669440.0, + "grad_norm": 1.8895715599685134, + "language_loss": 0.82301778, + "learning_rate": 1.0636103237296561e-06, + "loss": 0.84503764, + "num_input_tokens_seen": 119146885, + "step": 5534, + "time_per_iteration": 3.499525785446167 + }, + { + "auxiliary_loss_clip": 0.01126478, + "auxiliary_loss_mlp": 0.01083533, + "balance_loss_clip": 1.02630162, + "balance_loss_mlp": 1.00347233, + "epoch": 0.6655443996873684, + "flos": 25119514391040.0, + "grad_norm": 1.9160105428001764, + "language_loss": 0.84567904, + "learning_rate": 1.062922075750353e-06, + "loss": 0.86777914, + "num_input_tokens_seen": 119166900, + "step": 5535, + "time_per_iteration": 3.40232515335083 + }, + { + "auxiliary_loss_clip": 0.01106003, + "auxiliary_loss_mlp": 0.01083827, + "balance_loss_clip": 1.02262747, + "balance_loss_mlp": 1.00357556, + "epoch": 0.6656646425780076, + "flos": 17457326749440.0, + "grad_norm": 2.1397061571324794, + "language_loss": 0.71897423, + "learning_rate": 1.0622339699250267e-06, + "loss": 0.7408725, + "num_input_tokens_seen": 119184820, + "step": 5536, + "time_per_iteration": 3.5797245502471924 + }, + { + "auxiliary_loss_clip": 0.0110201, + "auxiliary_loss_mlp": 0.01084145, + "balance_loss_clip": 1.02372956, + "balance_loss_mlp": 1.00408411, + "epoch": 0.6657848854686467, + "flos": 23434190213760.0, + "grad_norm": 1.7827808915051186, + "language_loss": 0.79375964, + "learning_rate": 1.0615460063580624e-06, + "loss": 0.8156212, + "num_input_tokens_seen": 119203295, + "step": 5537, + "time_per_iteration": 2.711344003677368 + }, + { + "auxiliary_loss_clip": 0.01116404, + "auxiliary_loss_mlp": 0.01082847, + "balance_loss_clip": 1.02363908, + "balance_loss_mlp": 1.00273836, + "epoch": 0.6659051283592857, + "flos": 11509909459200.0, + "grad_norm": 2.044934304354135, + "language_loss": 0.72931802, + "learning_rate": 1.060858185153821e-06, + "loss": 0.75131059, + "num_input_tokens_seen": 119221395, + "step": 5538, + "time_per_iteration": 2.668076515197754 + }, + { + "auxiliary_loss_clip": 0.01118764, + "auxiliary_loss_mlp": 0.01084955, + "balance_loss_clip": 1.0257988, + "balance_loss_mlp": 1.00451267, + "epoch": 0.6660253712499249, + "flos": 20594554571520.0, + "grad_norm": 2.395371835245855, + "language_loss": 0.75917584, + "learning_rate": 1.0601705064166474e-06, + "loss": 0.78121299, + "num_input_tokens_seen": 119239790, + "step": 5539, + "time_per_iteration": 2.7291882038116455 + }, + { + "auxiliary_loss_clip": 0.01111312, + "auxiliary_loss_mlp": 0.01084343, + "balance_loss_clip": 1.0220437, + "balance_loss_mlp": 1.00418687, + "epoch": 0.666145614140564, + "flos": 21251504367360.0, + "grad_norm": 2.149657081036499, + "language_loss": 0.73552406, + "learning_rate": 1.0594829702508596e-06, + "loss": 0.75748062, + "num_input_tokens_seen": 119257505, + "step": 5540, + "time_per_iteration": 2.7626655101776123 + }, + { + "auxiliary_loss_clip": 0.01108192, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_clip": 1.02463484, + "balance_loss_mlp": 1.00390315, + "epoch": 0.666265857031203, + "flos": 33726188200320.0, + "grad_norm": 1.5576496760559313, + "language_loss": 0.55047679, + "learning_rate": 1.0587955767607592e-06, + "loss": 0.5723998, + "num_input_tokens_seen": 119279365, + "step": 5541, + "time_per_iteration": 2.8854238986968994 + }, + { + "auxiliary_loss_clip": 0.01134892, + "auxiliary_loss_mlp": 0.01084903, + "balance_loss_clip": 1.02542484, + "balance_loss_mlp": 1.00469947, + "epoch": 0.6663860999218422, + "flos": 17456644391040.0, + "grad_norm": 2.5187083080446233, + "language_loss": 0.77096945, + "learning_rate": 1.0581083260506206e-06, + "loss": 0.79316741, + "num_input_tokens_seen": 119296150, + "step": 5542, + "time_per_iteration": 2.590456247329712 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01084849, + "balance_loss_clip": 1.02477145, + "balance_loss_mlp": 1.00478768, + "epoch": 0.6665063428124812, + "flos": 17676740977920.0, + "grad_norm": 2.2657989989789526, + "language_loss": 0.76226568, + "learning_rate": 1.0574212182246993e-06, + "loss": 0.78428251, + "num_input_tokens_seen": 119314845, + "step": 5543, + "time_per_iteration": 2.7372710704803467 + }, + { + "auxiliary_loss_clip": 0.01116416, + "auxiliary_loss_mlp": 0.01083612, + "balance_loss_clip": 1.02393377, + "balance_loss_mlp": 1.00336051, + "epoch": 0.6666265857031203, + "flos": 27673265687040.0, + "grad_norm": 2.685604212643653, + "language_loss": 0.75908053, + "learning_rate": 1.0567342533872303e-06, + "loss": 0.78108078, + "num_input_tokens_seen": 119334875, + "step": 5544, + "time_per_iteration": 2.7887632846832275 + }, + { + "auxiliary_loss_clip": 0.01116651, + "auxiliary_loss_mlp": 0.01084395, + "balance_loss_clip": 1.02462816, + "balance_loss_mlp": 1.00423896, + "epoch": 0.6667468285937594, + "flos": 25046831220480.0, + "grad_norm": 1.8967831944816935, + "language_loss": 0.811396, + "learning_rate": 1.0560474316424255e-06, + "loss": 0.83340651, + "num_input_tokens_seen": 119354635, + "step": 5545, + "time_per_iteration": 2.8058900833129883 + }, + { + "auxiliary_loss_clip": 0.01118533, + "auxiliary_loss_mlp": 0.01084567, + "balance_loss_clip": 1.02505922, + "balance_loss_mlp": 1.00426793, + "epoch": 0.6668670714843985, + "flos": 22780472641920.0, + "grad_norm": 2.2131548972559054, + "language_loss": 0.73543108, + "learning_rate": 1.0553607530944746e-06, + "loss": 0.75746202, + "num_input_tokens_seen": 119372690, + "step": 5546, + "time_per_iteration": 2.7490785121917725 + }, + { + "auxiliary_loss_clip": 0.01109557, + "auxiliary_loss_mlp": 0.01084451, + "balance_loss_clip": 1.02534723, + "balance_loss_mlp": 1.00424743, + "epoch": 0.6669873143750376, + "flos": 22163886754560.0, + "grad_norm": 2.203237544617893, + "language_loss": 0.89396203, + "learning_rate": 1.0546742178475463e-06, + "loss": 0.91590214, + "num_input_tokens_seen": 119391685, + "step": 5547, + "time_per_iteration": 2.8334531784057617 + }, + { + "auxiliary_loss_clip": 0.01082337, + "auxiliary_loss_mlp": 0.01084132, + "balance_loss_clip": 1.02363539, + "balance_loss_mlp": 1.00407076, + "epoch": 0.6671075572656767, + "flos": 20514832335360.0, + "grad_norm": 1.8260179307798199, + "language_loss": 0.8691259, + "learning_rate": 1.0539878260057868e-06, + "loss": 0.89079058, + "num_input_tokens_seen": 119410725, + "step": 5548, + "time_per_iteration": 2.807014226913452 + }, + { + "auxiliary_loss_clip": 0.01121843, + "auxiliary_loss_mlp": 0.01085189, + "balance_loss_clip": 1.02280307, + "balance_loss_mlp": 1.00488997, + "epoch": 0.6672278001563158, + "flos": 17931203902080.0, + "grad_norm": 2.581015691762656, + "language_loss": 0.68595213, + "learning_rate": 1.0533015776733226e-06, + "loss": 0.70802242, + "num_input_tokens_seen": 119426875, + "step": 5549, + "time_per_iteration": 2.6214630603790283 + }, + { + "auxiliary_loss_clip": 0.01116259, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_clip": 1.02483571, + "balance_loss_mlp": 1.003672, + "epoch": 0.6673480430469548, + "flos": 22342146975360.0, + "grad_norm": 2.181896504403279, + "language_loss": 0.78458577, + "learning_rate": 1.0526154729542566e-06, + "loss": 0.80658758, + "num_input_tokens_seen": 119446935, + "step": 5550, + "time_per_iteration": 2.785985231399536 + }, + { + "auxiliary_loss_clip": 0.01107458, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.02529967, + "balance_loss_mlp": 1.00379574, + "epoch": 0.6674682859375939, + "flos": 20703830722560.0, + "grad_norm": 3.04037108397943, + "language_loss": 0.79556668, + "learning_rate": 1.0519295119526699e-06, + "loss": 0.81748176, + "num_input_tokens_seen": 119463240, + "step": 5551, + "time_per_iteration": 2.7171058654785156 + }, + { + "auxiliary_loss_clip": 0.01117529, + "auxiliary_loss_mlp": 0.01084205, + "balance_loss_clip": 1.02464664, + "balance_loss_mlp": 1.00404882, + "epoch": 0.667588528828233, + "flos": 26206673379840.0, + "grad_norm": 2.0021746423363846, + "language_loss": 0.82591808, + "learning_rate": 1.0512436947726227e-06, + "loss": 0.84793538, + "num_input_tokens_seen": 119484655, + "step": 5552, + "time_per_iteration": 2.9404568672180176 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01083425, + "balance_loss_clip": 1.02385068, + "balance_loss_mlp": 1.00317311, + "epoch": 0.6677087717188721, + "flos": 23071025756160.0, + "grad_norm": 2.3161565404586457, + "language_loss": 0.65353972, + "learning_rate": 1.0505580215181517e-06, + "loss": 0.67544365, + "num_input_tokens_seen": 119502895, + "step": 5553, + "time_per_iteration": 2.7411813735961914 + }, + { + "auxiliary_loss_clip": 0.01087451, + "auxiliary_loss_mlp": 0.01079289, + "balance_loss_clip": 1.02697682, + "balance_loss_mlp": 1.00032473, + "epoch": 0.6678290146095112, + "flos": 70941315219840.0, + "grad_norm": 0.7870017641413519, + "language_loss": 0.56613594, + "learning_rate": 1.0498724922932753e-06, + "loss": 0.58780336, + "num_input_tokens_seen": 119561010, + "step": 5554, + "time_per_iteration": 3.3109116554260254 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01085348, + "balance_loss_clip": 1.02778971, + "balance_loss_mlp": 1.00490558, + "epoch": 0.6679492575001503, + "flos": 18661088263680.0, + "grad_norm": 2.1429187108755334, + "language_loss": 0.86814785, + "learning_rate": 1.0491871072019851e-06, + "loss": 0.8903805, + "num_input_tokens_seen": 119578900, + "step": 5555, + "time_per_iteration": 2.5669801235198975 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.0108377, + "balance_loss_clip": 1.02306724, + "balance_loss_mlp": 1.00361419, + "epoch": 0.6680695003907894, + "flos": 29711985822720.0, + "grad_norm": 1.7104100259382478, + "language_loss": 0.63865227, + "learning_rate": 1.0485018663482555e-06, + "loss": 0.660577, + "num_input_tokens_seen": 119598920, + "step": 5556, + "time_per_iteration": 3.7092361450195312 + }, + { + "auxiliary_loss_clip": 0.01128447, + "auxiliary_loss_mlp": 0.01084132, + "balance_loss_clip": 1.02670574, + "balance_loss_mlp": 1.00388002, + "epoch": 0.6681897432814284, + "flos": 28218964083840.0, + "grad_norm": 2.3234649323322856, + "language_loss": 0.71298093, + "learning_rate": 1.0478167698360354e-06, + "loss": 0.73510671, + "num_input_tokens_seen": 119618220, + "step": 5557, + "time_per_iteration": 2.7546679973602295 + }, + { + "auxiliary_loss_clip": 0.01126376, + "auxiliary_loss_mlp": 0.0108452, + "balance_loss_clip": 1.02498209, + "balance_loss_mlp": 1.00426805, + "epoch": 0.6683099861720676, + "flos": 25046543911680.0, + "grad_norm": 1.9637904263918708, + "language_loss": 0.70108068, + "learning_rate": 1.0471318177692556e-06, + "loss": 0.72318965, + "num_input_tokens_seen": 119638520, + "step": 5558, + "time_per_iteration": 2.707704782485962 + }, + { + "auxiliary_loss_clip": 0.01097035, + "auxiliary_loss_mlp": 0.01084228, + "balance_loss_clip": 1.02242804, + "balance_loss_mlp": 1.00392866, + "epoch": 0.6684302290627067, + "flos": 22996977868800.0, + "grad_norm": 1.9714875253069808, + "language_loss": 0.75427008, + "learning_rate": 1.046447010251821e-06, + "loss": 0.77608275, + "num_input_tokens_seen": 119655850, + "step": 5559, + "time_per_iteration": 3.8136258125305176 + }, + { + "auxiliary_loss_clip": 0.01115397, + "auxiliary_loss_mlp": 0.01084091, + "balance_loss_clip": 1.02454698, + "balance_loss_mlp": 1.00402963, + "epoch": 0.6685504719533457, + "flos": 26573824247040.0, + "grad_norm": 1.638829426668597, + "language_loss": 0.75905019, + "learning_rate": 1.0457623473876157e-06, + "loss": 0.78104508, + "num_input_tokens_seen": 119675355, + "step": 5560, + "time_per_iteration": 2.8071818351745605 + }, + { + "auxiliary_loss_clip": 0.01134809, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_clip": 1.02526069, + "balance_loss_mlp": 1.00378978, + "epoch": 0.6686707148439849, + "flos": 28986087870720.0, + "grad_norm": 33.22535293683918, + "language_loss": 0.709144, + "learning_rate": 1.0450778292805046e-06, + "loss": 0.731332, + "num_input_tokens_seen": 119695340, + "step": 5561, + "time_per_iteration": 4.482053756713867 + }, + { + "auxiliary_loss_clip": 0.01127993, + "auxiliary_loss_mlp": 0.01085157, + "balance_loss_clip": 1.02625012, + "balance_loss_mlp": 1.00500083, + "epoch": 0.6687909577346239, + "flos": 23623152687360.0, + "grad_norm": 3.943312162947632, + "language_loss": 0.78837955, + "learning_rate": 1.0443934560343267e-06, + "loss": 0.81051105, + "num_input_tokens_seen": 119716750, + "step": 5562, + "time_per_iteration": 2.7181146144866943 + }, + { + "auxiliary_loss_clip": 0.01100293, + "auxiliary_loss_mlp": 0.01083546, + "balance_loss_clip": 1.02330101, + "balance_loss_mlp": 1.00329399, + "epoch": 0.668911200625263, + "flos": 23148593176320.0, + "grad_norm": 1.9182844531133707, + "language_loss": 0.77987242, + "learning_rate": 1.0437092277529034e-06, + "loss": 0.80171084, + "num_input_tokens_seen": 119736005, + "step": 5563, + "time_per_iteration": 2.815718412399292 + }, + { + "auxiliary_loss_clip": 0.01118041, + "auxiliary_loss_mlp": 0.0108361, + "balance_loss_clip": 1.02532673, + "balance_loss_mlp": 1.00345373, + "epoch": 0.6690314435159022, + "flos": 18551919853440.0, + "grad_norm": 2.011681531264421, + "language_loss": 0.73984504, + "learning_rate": 1.0430251445400292e-06, + "loss": 0.7618615, + "num_input_tokens_seen": 119754050, + "step": 5564, + "time_per_iteration": 2.6809473037719727 + }, + { + "auxiliary_loss_clip": 0.01061348, + "auxiliary_loss_mlp": 0.01084735, + "balance_loss_clip": 1.01923883, + "balance_loss_mlp": 1.0044837, + "epoch": 0.6691516864065412, + "flos": 31759540704000.0, + "grad_norm": 2.7514424602120453, + "language_loss": 0.62600422, + "learning_rate": 1.0423412064994787e-06, + "loss": 0.64746505, + "num_input_tokens_seen": 119774820, + "step": 5565, + "time_per_iteration": 3.0332157611846924 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01083869, + "balance_loss_clip": 1.0238111, + "balance_loss_mlp": 1.00376081, + "epoch": 0.6692719292971803, + "flos": 34933864296960.0, + "grad_norm": 1.8564624986541711, + "language_loss": 0.73863715, + "learning_rate": 1.0416574137350064e-06, + "loss": 0.76055062, + "num_input_tokens_seen": 119795525, + "step": 5566, + "time_per_iteration": 3.06559419631958 + }, + { + "auxiliary_loss_clip": 0.01129072, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_clip": 1.02697432, + "balance_loss_mlp": 1.00408912, + "epoch": 0.6693921721878194, + "flos": 20449188230400.0, + "grad_norm": 2.0581940213609506, + "language_loss": 0.80844563, + "learning_rate": 1.0409737663503428e-06, + "loss": 0.83057976, + "num_input_tokens_seen": 119813905, + "step": 5567, + "time_per_iteration": 2.723137378692627 + }, + { + "auxiliary_loss_clip": 0.01126274, + "auxiliary_loss_mlp": 0.01084108, + "balance_loss_clip": 1.02416849, + "balance_loss_mlp": 1.00390387, + "epoch": 0.6695124150784585, + "flos": 16614538963200.0, + "grad_norm": 1.8029832451428927, + "language_loss": 0.83081329, + "learning_rate": 1.040290264449196e-06, + "loss": 0.85291713, + "num_input_tokens_seen": 119832010, + "step": 5568, + "time_per_iteration": 2.634462594985962 + }, + { + "auxiliary_loss_clip": 0.01125444, + "auxiliary_loss_mlp": 0.01083997, + "balance_loss_clip": 1.02608228, + "balance_loss_mlp": 1.00393653, + "epoch": 0.6696326579690975, + "flos": 26652145852800.0, + "grad_norm": 1.9326894943226849, + "language_loss": 0.63800591, + "learning_rate": 1.0396069081352532e-06, + "loss": 0.66010034, + "num_input_tokens_seen": 119851165, + "step": 5569, + "time_per_iteration": 2.7512030601501465 + }, + { + "auxiliary_loss_clip": 0.01117654, + "auxiliary_loss_mlp": 0.01078825, + "balance_loss_clip": 1.02215219, + "balance_loss_mlp": 0.99986124, + "epoch": 0.6697529008597367, + "flos": 66964603662720.0, + "grad_norm": 0.7752334365576157, + "language_loss": 0.56060803, + "learning_rate": 1.0389236975121782e-06, + "loss": 0.58257282, + "num_input_tokens_seen": 119906015, + "step": 5570, + "time_per_iteration": 3.159945249557495 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01084227, + "balance_loss_clip": 1.02555203, + "balance_loss_mlp": 1.00392771, + "epoch": 0.6698731437503758, + "flos": 20886939279360.0, + "grad_norm": 4.308046021062095, + "language_loss": 0.71355283, + "learning_rate": 1.0382406326836147e-06, + "loss": 0.73575008, + "num_input_tokens_seen": 119925160, + "step": 5571, + "time_per_iteration": 2.6190927028656006 + }, + { + "auxiliary_loss_clip": 0.01126736, + "auxiliary_loss_mlp": 0.01084109, + "balance_loss_clip": 1.02510977, + "balance_loss_mlp": 1.00376201, + "epoch": 0.6699933866410148, + "flos": 20409470766720.0, + "grad_norm": 1.8563148822469464, + "language_loss": 0.76040781, + "learning_rate": 1.0375577137531828e-06, + "loss": 0.78251624, + "num_input_tokens_seen": 119943720, + "step": 5572, + "time_per_iteration": 2.7311043739318848 + }, + { + "auxiliary_loss_clip": 0.01115819, + "auxiliary_loss_mlp": 0.01084568, + "balance_loss_clip": 1.02330494, + "balance_loss_mlp": 1.00436401, + "epoch": 0.670113629531654, + "flos": 29023075900800.0, + "grad_norm": 1.6716807937152036, + "language_loss": 0.71861577, + "learning_rate": 1.0368749408244802e-06, + "loss": 0.74061966, + "num_input_tokens_seen": 119966640, + "step": 5573, + "time_per_iteration": 2.746577024459839 + }, + { + "auxiliary_loss_clip": 0.01128212, + "auxiliary_loss_mlp": 0.01084065, + "balance_loss_clip": 1.02671826, + "balance_loss_mlp": 1.00395632, + "epoch": 0.670233872422293, + "flos": 19791699730560.0, + "grad_norm": 3.152508901488574, + "language_loss": 0.7856164, + "learning_rate": 1.0361923140010836e-06, + "loss": 0.80773914, + "num_input_tokens_seen": 119985125, + "step": 5574, + "time_per_iteration": 2.7315750122070312 + }, + { + "auxiliary_loss_clip": 0.01128528, + "auxiliary_loss_mlp": 0.01083957, + "balance_loss_clip": 1.02641726, + "balance_loss_mlp": 1.00360978, + "epoch": 0.6703541153129321, + "flos": 24243689070720.0, + "grad_norm": 2.0077299916270346, + "language_loss": 0.63499069, + "learning_rate": 1.0355098333865455e-06, + "loss": 0.65711558, + "num_input_tokens_seen": 120004355, + "step": 5575, + "time_per_iteration": 2.6884801387786865 + }, + { + "auxiliary_loss_clip": 0.0112171, + "auxiliary_loss_mlp": 0.01084606, + "balance_loss_clip": 1.02300656, + "balance_loss_mlp": 1.00435448, + "epoch": 0.6704743582035713, + "flos": 26688523351680.0, + "grad_norm": 1.5537563415022586, + "language_loss": 0.69268566, + "learning_rate": 1.0348274990844006e-06, + "loss": 0.71474886, + "num_input_tokens_seen": 120027115, + "step": 5576, + "time_per_iteration": 2.7708473205566406 + }, + { + "auxiliary_loss_clip": 0.011251, + "auxiliary_loss_mlp": 0.01084378, + "balance_loss_clip": 1.02497447, + "balance_loss_mlp": 1.00426912, + "epoch": 0.6705946010942103, + "flos": 23514379326720.0, + "grad_norm": 4.602709775858744, + "language_loss": 0.7245394, + "learning_rate": 1.034145311198155e-06, + "loss": 0.74663419, + "num_input_tokens_seen": 120047130, + "step": 5577, + "time_per_iteration": 2.6226229667663574 + }, + { + "auxiliary_loss_clip": 0.01134665, + "auxiliary_loss_mlp": 0.01083827, + "balance_loss_clip": 1.02486658, + "balance_loss_mlp": 1.00371802, + "epoch": 0.6707148439848494, + "flos": 24061011477120.0, + "grad_norm": 1.5862269981124621, + "language_loss": 0.63748127, + "learning_rate": 1.0334632698312989e-06, + "loss": 0.65966618, + "num_input_tokens_seen": 120067925, + "step": 5578, + "time_per_iteration": 2.6718382835388184 + }, + { + "auxiliary_loss_clip": 0.0111746, + "auxiliary_loss_mlp": 0.01083486, + "balance_loss_clip": 1.02484322, + "balance_loss_mlp": 1.0033294, + "epoch": 0.6708350868754885, + "flos": 22528667324160.0, + "grad_norm": 1.851758767352783, + "language_loss": 0.74926496, + "learning_rate": 1.032781375087295e-06, + "loss": 0.77127439, + "num_input_tokens_seen": 120087825, + "step": 5579, + "time_per_iteration": 2.7272937297821045 + }, + { + "auxiliary_loss_clip": 0.01116729, + "auxiliary_loss_mlp": 0.01083634, + "balance_loss_clip": 1.02448905, + "balance_loss_mlp": 1.0035727, + "epoch": 0.6709553297661276, + "flos": 25227749047680.0, + "grad_norm": 1.4041530236488495, + "language_loss": 0.67448747, + "learning_rate": 1.0320996270695891e-06, + "loss": 0.69649112, + "num_input_tokens_seen": 120108895, + "step": 5580, + "time_per_iteration": 2.718503475189209 + }, + { + "auxiliary_loss_clip": 0.01110553, + "auxiliary_loss_mlp": 0.01083423, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.00321877, + "epoch": 0.6710755726567667, + "flos": 20448757267200.0, + "grad_norm": 1.8734958291541477, + "language_loss": 0.73296893, + "learning_rate": 1.0314180258815998e-06, + "loss": 0.75490868, + "num_input_tokens_seen": 120127535, + "step": 5581, + "time_per_iteration": 3.6650235652923584 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01084076, + "balance_loss_clip": 1.0240624, + "balance_loss_mlp": 1.00396705, + "epoch": 0.6711958155474057, + "flos": 25995411538560.0, + "grad_norm": 2.1590333998045934, + "language_loss": 0.7422024, + "learning_rate": 1.0307365716267247e-06, + "loss": 0.7641201, + "num_input_tokens_seen": 120147980, + "step": 5582, + "time_per_iteration": 2.7532858848571777 + }, + { + "auxiliary_loss_clip": 0.01124759, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_clip": 1.0243628, + "balance_loss_mlp": 1.00374746, + "epoch": 0.6713160584380449, + "flos": 19937712516480.0, + "grad_norm": 2.207365994317288, + "language_loss": 0.77651745, + "learning_rate": 1.0300552644083423e-06, + "loss": 0.79860461, + "num_input_tokens_seen": 120166905, + "step": 5583, + "time_per_iteration": 2.702702283859253 + }, + { + "auxiliary_loss_clip": 0.01103065, + "auxiliary_loss_mlp": 0.01083812, + "balance_loss_clip": 1.02087235, + "balance_loss_mlp": 1.00360847, + "epoch": 0.6714363013286839, + "flos": 18223373128320.0, + "grad_norm": 2.296525693668501, + "language_loss": 0.72187424, + "learning_rate": 1.0293741043298036e-06, + "loss": 0.74374306, + "num_input_tokens_seen": 120185255, + "step": 5584, + "time_per_iteration": 2.7139482498168945 + }, + { + "auxiliary_loss_clip": 0.01103063, + "auxiliary_loss_mlp": 0.01084845, + "balance_loss_clip": 1.02208018, + "balance_loss_mlp": 1.00454593, + "epoch": 0.671556544219323, + "flos": 25812374808960.0, + "grad_norm": 2.103654291177409, + "language_loss": 0.71300876, + "learning_rate": 1.0286930914944436e-06, + "loss": 0.73488784, + "num_input_tokens_seen": 120205070, + "step": 5585, + "time_per_iteration": 3.646216630935669 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01084086, + "balance_loss_clip": 1.02409863, + "balance_loss_mlp": 1.00397778, + "epoch": 0.6716767871099621, + "flos": 15850431918720.0, + "grad_norm": 2.327889862650017, + "language_loss": 0.76920283, + "learning_rate": 1.0280122260055684e-06, + "loss": 0.79138756, + "num_input_tokens_seen": 120220780, + "step": 5586, + "time_per_iteration": 4.492345094680786 + }, + { + "auxiliary_loss_clip": 0.01136819, + "auxiliary_loss_mlp": 0.01084393, + "balance_loss_clip": 1.02678573, + "balance_loss_mlp": 1.00418854, + "epoch": 0.6717970300006012, + "flos": 19756112330880.0, + "grad_norm": 2.5233548684103773, + "language_loss": 0.82285023, + "learning_rate": 1.0273315079664652e-06, + "loss": 0.84506238, + "num_input_tokens_seen": 120238735, + "step": 5587, + "time_per_iteration": 2.6423027515411377 + }, + { + "auxiliary_loss_clip": 0.01127015, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_clip": 1.02568305, + "balance_loss_mlp": 1.00403595, + "epoch": 0.6719172728912403, + "flos": 25485049146240.0, + "grad_norm": 2.2746255325748197, + "language_loss": 0.74214518, + "learning_rate": 1.0266509374803992e-06, + "loss": 0.76425725, + "num_input_tokens_seen": 120259895, + "step": 5588, + "time_per_iteration": 2.726997137069702 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.00872823, + "balance_loss_clip": 1.0259676, + "balance_loss_mlp": 1.00007033, + "epoch": 0.6720375157818794, + "flos": 15880344969600.0, + "grad_norm": 3.623049508523337, + "language_loss": 0.84691596, + "learning_rate": 1.0259705146506123e-06, + "loss": 0.86699927, + "num_input_tokens_seen": 120274790, + "step": 5589, + "time_per_iteration": 2.6769943237304688 + }, + { + "auxiliary_loss_clip": 0.01126828, + "auxiliary_loss_mlp": 0.01083761, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.00355697, + "epoch": 0.6721577586725185, + "flos": 32010843231360.0, + "grad_norm": 2.0628506031176252, + "language_loss": 0.77517575, + "learning_rate": 1.025290239580324e-06, + "loss": 0.79728162, + "num_input_tokens_seen": 120295460, + "step": 5590, + "time_per_iteration": 2.7346720695495605 + }, + { + "auxiliary_loss_clip": 0.01100409, + "auxiliary_loss_mlp": 0.01084246, + "balance_loss_clip": 1.02437854, + "balance_loss_mlp": 1.00399423, + "epoch": 0.6722780015631575, + "flos": 20737873837440.0, + "grad_norm": 1.669179178872336, + "language_loss": 0.75299191, + "learning_rate": 1.0246101123727313e-06, + "loss": 0.77483845, + "num_input_tokens_seen": 120314440, + "step": 5591, + "time_per_iteration": 2.893235206604004 + }, + { + "auxiliary_loss_clip": 0.01128342, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_clip": 1.02659655, + "balance_loss_mlp": 1.0053463, + "epoch": 0.6723982444537967, + "flos": 16909617191040.0, + "grad_norm": 1.9812585322287928, + "language_loss": 0.78886676, + "learning_rate": 1.0239301331310085e-06, + "loss": 0.81100476, + "num_input_tokens_seen": 120332060, + "step": 5592, + "time_per_iteration": 2.750157356262207 + }, + { + "auxiliary_loss_clip": 0.011256, + "auxiliary_loss_mlp": 0.01084855, + "balance_loss_clip": 1.02516508, + "balance_loss_mlp": 1.00465143, + "epoch": 0.6725184873444358, + "flos": 20667812359680.0, + "grad_norm": 1.5011424048481103, + "language_loss": 0.88360524, + "learning_rate": 1.0232503019583088e-06, + "loss": 0.90570986, + "num_input_tokens_seen": 120351670, + "step": 5593, + "time_per_iteration": 2.7142210006713867 + }, + { + "auxiliary_loss_clip": 0.01124896, + "auxiliary_loss_mlp": 0.01085386, + "balance_loss_clip": 1.02443659, + "balance_loss_mlp": 1.00508642, + "epoch": 0.6726387302350748, + "flos": 23727616416000.0, + "grad_norm": 1.7727233033993335, + "language_loss": 0.69639611, + "learning_rate": 1.0225706189577619e-06, + "loss": 0.71849895, + "num_input_tokens_seen": 120370195, + "step": 5594, + "time_per_iteration": 2.636598587036133 + }, + { + "auxiliary_loss_clip": 0.01126586, + "auxiliary_loss_mlp": 0.01083403, + "balance_loss_clip": 1.0257585, + "balance_loss_mlp": 1.00310361, + "epoch": 0.672758973125714, + "flos": 15188274650880.0, + "grad_norm": 2.0309140732703583, + "language_loss": 0.74980843, + "learning_rate": 1.021891084232475e-06, + "loss": 0.77190828, + "num_input_tokens_seen": 120388130, + "step": 5595, + "time_per_iteration": 2.619230270385742 + }, + { + "auxiliary_loss_clip": 0.01127655, + "auxiliary_loss_mlp": 0.01083409, + "balance_loss_clip": 1.02585232, + "balance_loss_mlp": 1.00310946, + "epoch": 0.672879216016353, + "flos": 18077252601600.0, + "grad_norm": 3.385627813704802, + "language_loss": 0.80453402, + "learning_rate": 1.0212116978855325e-06, + "loss": 0.8266446, + "num_input_tokens_seen": 120406145, + "step": 5596, + "time_per_iteration": 2.5950095653533936 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01083278, + "balance_loss_clip": 1.02288401, + "balance_loss_mlp": 1.00312173, + "epoch": 0.6729994589069921, + "flos": 23476349802240.0, + "grad_norm": 1.6321220815531428, + "language_loss": 0.78718221, + "learning_rate": 1.020532460019997e-06, + "loss": 0.8090595, + "num_input_tokens_seen": 120425395, + "step": 5597, + "time_per_iteration": 2.762511730194092 + }, + { + "auxiliary_loss_clip": 0.01073035, + "auxiliary_loss_mlp": 0.01084451, + "balance_loss_clip": 1.0211935, + "balance_loss_mlp": 1.00424707, + "epoch": 0.6731197017976313, + "flos": 26322018929280.0, + "grad_norm": 1.645671072006552, + "language_loss": 0.70620888, + "learning_rate": 1.0198533707389096e-06, + "loss": 0.72778374, + "num_input_tokens_seen": 120446270, + "step": 5598, + "time_per_iteration": 3.044689893722534 + }, + { + "auxiliary_loss_clip": 0.01125298, + "auxiliary_loss_mlp": 0.00872975, + "balance_loss_clip": 1.02464581, + "balance_loss_mlp": 1.00007451, + "epoch": 0.6732399446882703, + "flos": 21616428591360.0, + "grad_norm": 1.7116194111983987, + "language_loss": 0.73275596, + "learning_rate": 1.0191744301452853e-06, + "loss": 0.75273871, + "num_input_tokens_seen": 120465570, + "step": 5599, + "time_per_iteration": 3.0256361961364746 + }, + { + "auxiliary_loss_clip": 0.01135886, + "auxiliary_loss_mlp": 0.01083968, + "balance_loss_clip": 1.02593994, + "balance_loss_mlp": 1.00381136, + "epoch": 0.6733601875789094, + "flos": 25880173729920.0, + "grad_norm": 2.967584297168704, + "language_loss": 0.70096874, + "learning_rate": 1.0184956383421208e-06, + "loss": 0.7231673, + "num_input_tokens_seen": 120484220, + "step": 5600, + "time_per_iteration": 2.7391226291656494 + }, + { + "auxiliary_loss_clip": 0.01127668, + "auxiliary_loss_mlp": 0.0108592, + "balance_loss_clip": 1.02620435, + "balance_loss_mlp": 1.00576377, + "epoch": 0.6734804304695485, + "flos": 22929573997440.0, + "grad_norm": 2.128028031756388, + "language_loss": 0.65218186, + "learning_rate": 1.017816995432387e-06, + "loss": 0.67431784, + "num_input_tokens_seen": 120503320, + "step": 5601, + "time_per_iteration": 2.6890408992767334 + }, + { + "auxiliary_loss_clip": 0.0111795, + "auxiliary_loss_mlp": 0.01083877, + "balance_loss_clip": 1.02527618, + "balance_loss_mlp": 1.0037688, + "epoch": 0.6736006733601876, + "flos": 18697968552960.0, + "grad_norm": 2.2488185700766774, + "language_loss": 0.74567282, + "learning_rate": 1.0171385015190353e-06, + "loss": 0.76769114, + "num_input_tokens_seen": 120523180, + "step": 5602, + "time_per_iteration": 2.773393154144287 + }, + { + "auxiliary_loss_clip": 0.0111007, + "auxiliary_loss_mlp": 0.00872838, + "balance_loss_clip": 1.02089858, + "balance_loss_mlp": 1.00008559, + "epoch": 0.6737209162508266, + "flos": 19427745173760.0, + "grad_norm": 2.1637922277878503, + "language_loss": 0.73170102, + "learning_rate": 1.0164601567049908e-06, + "loss": 0.75153011, + "num_input_tokens_seen": 120541710, + "step": 5603, + "time_per_iteration": 2.725351572036743 + }, + { + "auxiliary_loss_clip": 0.011185, + "auxiliary_loss_mlp": 0.01085015, + "balance_loss_clip": 1.02500665, + "balance_loss_mlp": 1.00485826, + "epoch": 0.6738411591414658, + "flos": 20158060498560.0, + "grad_norm": 1.6901197389935056, + "language_loss": 0.8072992, + "learning_rate": 1.015781961093158e-06, + "loss": 0.82933438, + "num_input_tokens_seen": 120561030, + "step": 5604, + "time_per_iteration": 2.7508487701416016 + }, + { + "auxiliary_loss_clip": 0.01117819, + "auxiliary_loss_mlp": 0.0108564, + "balance_loss_clip": 1.02387428, + "balance_loss_mlp": 1.00548339, + "epoch": 0.6739614020321049, + "flos": 21653847584640.0, + "grad_norm": 1.4691953768406962, + "language_loss": 0.77253288, + "learning_rate": 1.0151039147864197e-06, + "loss": 0.79456753, + "num_input_tokens_seen": 120581005, + "step": 5605, + "time_per_iteration": 2.673962116241455 + }, + { + "auxiliary_loss_clip": 0.01065764, + "auxiliary_loss_mlp": 0.01084795, + "balance_loss_clip": 1.01830423, + "balance_loss_mlp": 1.0045433, + "epoch": 0.6740816449227439, + "flos": 19171702051200.0, + "grad_norm": 2.0185501425379595, + "language_loss": 0.65853053, + "learning_rate": 1.0144260178876336e-06, + "loss": 0.68003607, + "num_input_tokens_seen": 120600350, + "step": 5606, + "time_per_iteration": 3.993075370788574 + }, + { + "auxiliary_loss_clip": 0.01118714, + "auxiliary_loss_mlp": 0.01083797, + "balance_loss_clip": 1.02528512, + "balance_loss_mlp": 1.00364065, + "epoch": 0.6742018878133831, + "flos": 21097015971840.0, + "grad_norm": 2.1831324785216193, + "language_loss": 0.67345238, + "learning_rate": 1.0137482704996388e-06, + "loss": 0.69547749, + "num_input_tokens_seen": 120614700, + "step": 5607, + "time_per_iteration": 2.7217283248901367 + }, + { + "auxiliary_loss_clip": 0.01108591, + "auxiliary_loss_mlp": 0.01085414, + "balance_loss_clip": 1.02494621, + "balance_loss_mlp": 1.00511432, + "epoch": 0.6743221307040221, + "flos": 23549966726400.0, + "grad_norm": 2.377671207510074, + "language_loss": 0.78801334, + "learning_rate": 1.0130706727252461e-06, + "loss": 0.80995345, + "num_input_tokens_seen": 120631755, + "step": 5608, + "time_per_iteration": 2.7419865131378174 + }, + { + "auxiliary_loss_clip": 0.01107716, + "auxiliary_loss_mlp": 0.01084122, + "balance_loss_clip": 1.02375698, + "balance_loss_mlp": 1.00391793, + "epoch": 0.6744423735946612, + "flos": 16249542912000.0, + "grad_norm": 2.141663683889983, + "language_loss": 0.67885166, + "learning_rate": 1.0123932246672468e-06, + "loss": 0.70077008, + "num_input_tokens_seen": 120645900, + "step": 5609, + "time_per_iteration": 3.623281478881836 + }, + { + "auxiliary_loss_clip": 0.01084284, + "auxiliary_loss_mlp": 0.00873164, + "balance_loss_clip": 1.02203906, + "balance_loss_mlp": 1.00169301, + "epoch": 0.6745626164853004, + "flos": 57843257829120.0, + "grad_norm": 0.7455296177120014, + "language_loss": 0.5581637, + "learning_rate": 1.0117159264284114e-06, + "loss": 0.57773817, + "num_input_tokens_seen": 120709070, + "step": 5610, + "time_per_iteration": 4.251312732696533 + }, + { + "auxiliary_loss_clip": 0.01116477, + "auxiliary_loss_mlp": 0.01085849, + "balance_loss_clip": 1.02429843, + "balance_loss_mlp": 1.00559723, + "epoch": 0.6746828593759394, + "flos": 20485027025280.0, + "grad_norm": 1.5551262857753678, + "language_loss": 0.76789171, + "learning_rate": 1.0110387781114837e-06, + "loss": 0.78991497, + "num_input_tokens_seen": 120727685, + "step": 5611, + "time_per_iteration": 2.702314853668213 + }, + { + "auxiliary_loss_clip": 0.01135694, + "auxiliary_loss_mlp": 0.01084358, + "balance_loss_clip": 1.02601349, + "balance_loss_mlp": 1.00405872, + "epoch": 0.6748031022665785, + "flos": 19208223204480.0, + "grad_norm": 2.0404099596687897, + "language_loss": 0.7744422, + "learning_rate": 1.0103617798191872e-06, + "loss": 0.79664278, + "num_input_tokens_seen": 120747160, + "step": 5612, + "time_per_iteration": 3.4954111576080322 + }, + { + "auxiliary_loss_clip": 0.01111274, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_clip": 1.02222776, + "balance_loss_mlp": 1.00381899, + "epoch": 0.6749233451572175, + "flos": 15195026407680.0, + "grad_norm": 2.1479633296791154, + "language_loss": 0.82721215, + "learning_rate": 1.0096849316542217e-06, + "loss": 0.84916508, + "num_input_tokens_seen": 120763710, + "step": 5613, + "time_per_iteration": 2.7083020210266113 + }, + { + "auxiliary_loss_clip": 0.01078805, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_clip": 1.02118683, + "balance_loss_mlp": 1.00370812, + "epoch": 0.6750435880478567, + "flos": 26499489050880.0, + "grad_norm": 2.2204602044883703, + "language_loss": 0.7465449, + "learning_rate": 1.0090082337192643e-06, + "loss": 0.76817346, + "num_input_tokens_seen": 120783355, + "step": 5614, + "time_per_iteration": 2.88816499710083 + }, + { + "auxiliary_loss_clip": 0.01088864, + "auxiliary_loss_mlp": 0.01084076, + "balance_loss_clip": 1.02232218, + "balance_loss_mlp": 1.00391924, + "epoch": 0.6751638309384957, + "flos": 23404313076480.0, + "grad_norm": 2.1271358544466996, + "language_loss": 0.78058302, + "learning_rate": 1.0083316861169705e-06, + "loss": 0.80231237, + "num_input_tokens_seen": 120802090, + "step": 5615, + "time_per_iteration": 2.878973960876465 + }, + { + "auxiliary_loss_clip": 0.01107805, + "auxiliary_loss_mlp": 0.01083553, + "balance_loss_clip": 1.0235101, + "balance_loss_mlp": 1.00330126, + "epoch": 0.6752840738291348, + "flos": 23441408847360.0, + "grad_norm": 2.643617143150777, + "language_loss": 0.71951675, + "learning_rate": 1.0076552889499713e-06, + "loss": 0.74143028, + "num_input_tokens_seen": 120822855, + "step": 5616, + "time_per_iteration": 2.8310203552246094 + }, + { + "auxiliary_loss_clip": 0.01126058, + "auxiliary_loss_mlp": 0.010853, + "balance_loss_clip": 1.02577043, + "balance_loss_mlp": 1.00519145, + "epoch": 0.675404316719774, + "flos": 30335826257280.0, + "grad_norm": 2.5640414050965656, + "language_loss": 0.73876053, + "learning_rate": 1.006979042320876e-06, + "loss": 0.76087415, + "num_input_tokens_seen": 120843070, + "step": 5617, + "time_per_iteration": 2.71877384185791 + }, + { + "auxiliary_loss_clip": 0.01117733, + "auxiliary_loss_mlp": 0.01083438, + "balance_loss_clip": 1.02425003, + "balance_loss_mlp": 1.00332904, + "epoch": 0.675524559610413, + "flos": 23622613983360.0, + "grad_norm": 1.86158417867398, + "language_loss": 0.63071072, + "learning_rate": 1.0063029463322702e-06, + "loss": 0.65272248, + "num_input_tokens_seen": 120863345, + "step": 5618, + "time_per_iteration": 2.737274408340454 + }, + { + "auxiliary_loss_clip": 0.01091336, + "auxiliary_loss_mlp": 0.00872851, + "balance_loss_clip": 1.02172399, + "balance_loss_mlp": 1.00012112, + "epoch": 0.6756448025010521, + "flos": 21248631279360.0, + "grad_norm": 2.2692218504337323, + "language_loss": 0.75078845, + "learning_rate": 1.0056270010867164e-06, + "loss": 0.77043033, + "num_input_tokens_seen": 120880915, + "step": 5619, + "time_per_iteration": 2.8373780250549316 + }, + { + "auxiliary_loss_clip": 0.0111942, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_clip": 1.02543688, + "balance_loss_mlp": 1.00379765, + "epoch": 0.6757650453916912, + "flos": 21646521210240.0, + "grad_norm": 2.1160918655367613, + "language_loss": 0.78616923, + "learning_rate": 1.004951206686758e-06, + "loss": 0.80820537, + "num_input_tokens_seen": 120899190, + "step": 5620, + "time_per_iteration": 2.763937473297119 + }, + { + "auxiliary_loss_clip": 0.01126618, + "auxiliary_loss_mlp": 0.01083754, + "balance_loss_clip": 1.02537251, + "balance_loss_mlp": 1.00345445, + "epoch": 0.6758852882823303, + "flos": 21795658479360.0, + "grad_norm": 1.7674600661767002, + "language_loss": 0.71518272, + "learning_rate": 1.0042755632349087e-06, + "loss": 0.73728645, + "num_input_tokens_seen": 120916080, + "step": 5621, + "time_per_iteration": 2.6334896087646484 + }, + { + "auxiliary_loss_clip": 0.0110713, + "auxiliary_loss_mlp": 0.01084209, + "balance_loss_clip": 1.02397704, + "balance_loss_mlp": 1.00395775, + "epoch": 0.6760055311729694, + "flos": 27088783580160.0, + "grad_norm": 1.9672359526373935, + "language_loss": 0.62684029, + "learning_rate": 1.0036000708336653e-06, + "loss": 0.64875364, + "num_input_tokens_seen": 120935210, + "step": 5622, + "time_per_iteration": 2.8326942920684814 + }, + { + "auxiliary_loss_clip": 0.01118293, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.02596569, + "balance_loss_mlp": 1.00415444, + "epoch": 0.6761257740636085, + "flos": 17999792922240.0, + "grad_norm": 1.9123060262490459, + "language_loss": 0.79419732, + "learning_rate": 1.0029247295854984e-06, + "loss": 0.81622434, + "num_input_tokens_seen": 120951830, + "step": 5623, + "time_per_iteration": 2.6797802448272705 + }, + { + "auxiliary_loss_clip": 0.0110881, + "auxiliary_loss_mlp": 0.01085196, + "balance_loss_clip": 1.02433443, + "balance_loss_mlp": 1.00503945, + "epoch": 0.6762460169542476, + "flos": 15121912273920.0, + "grad_norm": 1.733672532540547, + "language_loss": 0.71721458, + "learning_rate": 1.0022495395928588e-06, + "loss": 0.73915464, + "num_input_tokens_seen": 120970310, + "step": 5624, + "time_per_iteration": 2.7734482288360596 + }, + { + "auxiliary_loss_clip": 0.01117674, + "auxiliary_loss_mlp": 0.01078961, + "balance_loss_clip": 1.02213669, + "balance_loss_mlp": 0.9999966, + "epoch": 0.6763662598448866, + "flos": 67886970030720.0, + "grad_norm": 0.7891498792752808, + "language_loss": 0.62428689, + "learning_rate": 1.0015745009581697e-06, + "loss": 0.64625323, + "num_input_tokens_seen": 121031915, + "step": 5625, + "time_per_iteration": 3.2585837841033936 + }, + { + "auxiliary_loss_clip": 0.01120896, + "auxiliary_loss_mlp": 0.01084863, + "balance_loss_clip": 1.02570593, + "balance_loss_mlp": 1.00465918, + "epoch": 0.6764865027355258, + "flos": 20631829910400.0, + "grad_norm": 16.562941303565708, + "language_loss": 0.67082381, + "learning_rate": 1.0008996137838343e-06, + "loss": 0.69288141, + "num_input_tokens_seen": 121050890, + "step": 5626, + "time_per_iteration": 2.675954580307007 + }, + { + "auxiliary_loss_clip": 0.01137053, + "auxiliary_loss_mlp": 0.01084523, + "balance_loss_clip": 1.02702403, + "balance_loss_mlp": 1.00408101, + "epoch": 0.6766067456261649, + "flos": 21215809226880.0, + "grad_norm": 7.208730109285156, + "language_loss": 0.80092317, + "learning_rate": 1.000224878172234e-06, + "loss": 0.82313895, + "num_input_tokens_seen": 121070015, + "step": 5627, + "time_per_iteration": 2.5975773334503174 + }, + { + "auxiliary_loss_clip": 0.01110558, + "auxiliary_loss_mlp": 0.01084932, + "balance_loss_clip": 1.02594781, + "balance_loss_mlp": 1.00482309, + "epoch": 0.6767269885168039, + "flos": 19938251220480.0, + "grad_norm": 1.8586693273747186, + "language_loss": 0.72872669, + "learning_rate": 9.99550294225724e-07, + "loss": 0.75068158, + "num_input_tokens_seen": 121089170, + "step": 5628, + "time_per_iteration": 2.7098278999328613 + }, + { + "auxiliary_loss_clip": 0.01094965, + "auxiliary_loss_mlp": 0.01083925, + "balance_loss_clip": 1.02008009, + "balance_loss_mlp": 1.00372112, + "epoch": 0.6768472314074431, + "flos": 20814076540800.0, + "grad_norm": 1.7607604118497016, + "language_loss": 0.72304618, + "learning_rate": 9.988758620466402e-07, + "loss": 0.74483508, + "num_input_tokens_seen": 121108040, + "step": 5629, + "time_per_iteration": 2.8392107486724854 + }, + { + "auxiliary_loss_clip": 0.01074171, + "auxiliary_loss_mlp": 0.01083338, + "balance_loss_clip": 1.02399564, + "balance_loss_mlp": 1.00332487, + "epoch": 0.6769674742980821, + "flos": 23186012169600.0, + "grad_norm": 1.523165614042023, + "language_loss": 0.76264095, + "learning_rate": 9.982015817372917e-07, + "loss": 0.78421605, + "num_input_tokens_seen": 121128480, + "step": 5630, + "time_per_iteration": 2.885986089706421 + }, + { + "auxiliary_loss_clip": 0.01100529, + "auxiliary_loss_mlp": 0.01084267, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.00406289, + "epoch": 0.6770877171887212, + "flos": 24242934885120.0, + "grad_norm": 1.6718579597389644, + "language_loss": 0.82070744, + "learning_rate": 9.975274533999657e-07, + "loss": 0.8425554, + "num_input_tokens_seen": 121148010, + "step": 5631, + "time_per_iteration": 2.849548101425171 + }, + { + "auxiliary_loss_clip": 0.01134688, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_clip": 1.02490139, + "balance_loss_mlp": 1.00361466, + "epoch": 0.6772079600793603, + "flos": 18141567903360.0, + "grad_norm": 2.4823040154706124, + "language_loss": 0.83819318, + "learning_rate": 9.96853477136929e-07, + "loss": 0.86037874, + "num_input_tokens_seen": 121162755, + "step": 5632, + "time_per_iteration": 3.4342100620269775 + }, + { + "auxiliary_loss_clip": 0.01104142, + "auxiliary_loss_mlp": 0.01083811, + "balance_loss_clip": 1.02511764, + "balance_loss_mlp": 1.00360668, + "epoch": 0.6773282029699994, + "flos": 22452069571200.0, + "grad_norm": 1.981629462299035, + "language_loss": 0.75093579, + "learning_rate": 9.96179653050422e-07, + "loss": 0.77281535, + "num_input_tokens_seen": 121182915, + "step": 5633, + "time_per_iteration": 2.8538877964019775 + }, + { + "auxiliary_loss_clip": 0.01106897, + "auxiliary_loss_mlp": 0.01082991, + "balance_loss_clip": 1.02304327, + "balance_loss_mlp": 1.00278735, + "epoch": 0.6774484458606385, + "flos": 18693730748160.0, + "grad_norm": 1.8850562973336185, + "language_loss": 0.74140596, + "learning_rate": 9.955059812426635e-07, + "loss": 0.76330489, + "num_input_tokens_seen": 121200445, + "step": 5634, + "time_per_iteration": 2.7428665161132812 + }, + { + "auxiliary_loss_clip": 0.01136968, + "auxiliary_loss_mlp": 0.01085833, + "balance_loss_clip": 1.02730298, + "balance_loss_mlp": 1.00558138, + "epoch": 0.6775686887512776, + "flos": 25994046821760.0, + "grad_norm": 2.1071728087425567, + "language_loss": 0.82891715, + "learning_rate": 9.948324618158493e-07, + "loss": 0.85114515, + "num_input_tokens_seen": 121220785, + "step": 5635, + "time_per_iteration": 3.6647908687591553 + }, + { + "auxiliary_loss_clip": 0.01126827, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_clip": 1.02472353, + "balance_loss_mlp": 1.00406969, + "epoch": 0.6776889316419167, + "flos": 13587987922560.0, + "grad_norm": 2.2428424165326533, + "language_loss": 0.77200878, + "learning_rate": 9.941590948721502e-07, + "loss": 0.7941193, + "num_input_tokens_seen": 121237985, + "step": 5636, + "time_per_iteration": 3.6510815620422363 + }, + { + "auxiliary_loss_clip": 0.01108897, + "auxiliary_loss_mlp": 0.01085477, + "balance_loss_clip": 1.02392495, + "balance_loss_mlp": 1.00532079, + "epoch": 0.6778091745325557, + "flos": 27601121220480.0, + "grad_norm": 1.6218211496367654, + "language_loss": 0.76468897, + "learning_rate": 9.934858805137188e-07, + "loss": 0.78663278, + "num_input_tokens_seen": 121258635, + "step": 5637, + "time_per_iteration": 3.704676389694214 + }, + { + "auxiliary_loss_clip": 0.01124971, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.02497816, + "balance_loss_mlp": 1.00339198, + "epoch": 0.6779294174231949, + "flos": 18734058743040.0, + "grad_norm": 1.5951985942928053, + "language_loss": 0.80886322, + "learning_rate": 9.92812818842677e-07, + "loss": 0.83094788, + "num_input_tokens_seen": 121277810, + "step": 5638, + "time_per_iteration": 2.631842851638794 + }, + { + "auxiliary_loss_clip": 0.01119812, + "auxiliary_loss_mlp": 0.01084025, + "balance_loss_clip": 1.02089131, + "balance_loss_mlp": 1.00386882, + "epoch": 0.678049660313834, + "flos": 45873797765760.0, + "grad_norm": 1.8302173542073945, + "language_loss": 0.6408633, + "learning_rate": 9.921399099611306e-07, + "loss": 0.66290164, + "num_input_tokens_seen": 121298975, + "step": 5639, + "time_per_iteration": 2.8816659450531006 + }, + { + "auxiliary_loss_clip": 0.01118408, + "auxiliary_loss_mlp": 0.01084743, + "balance_loss_clip": 1.02524829, + "balance_loss_mlp": 1.00468183, + "epoch": 0.678169903204473, + "flos": 19974556892160.0, + "grad_norm": 1.524960513095452, + "language_loss": 0.69006848, + "learning_rate": 9.914671539711588e-07, + "loss": 0.71210003, + "num_input_tokens_seen": 121318495, + "step": 5640, + "time_per_iteration": 2.6768336296081543 + }, + { + "auxiliary_loss_clip": 0.01061088, + "auxiliary_loss_mlp": 0.00872918, + "balance_loss_clip": 1.01986098, + "balance_loss_mlp": 1.00003469, + "epoch": 0.6782901460951122, + "flos": 21395613732480.0, + "grad_norm": 2.0190392387076237, + "language_loss": 0.78303909, + "learning_rate": 9.90794550974817e-07, + "loss": 0.80237913, + "num_input_tokens_seen": 121338890, + "step": 5641, + "time_per_iteration": 2.972440481185913 + }, + { + "auxiliary_loss_clip": 0.01107555, + "auxiliary_loss_mlp": 0.01084614, + "balance_loss_clip": 1.02346873, + "balance_loss_mlp": 1.00445747, + "epoch": 0.6784103889857512, + "flos": 21434002392960.0, + "grad_norm": 1.9688758030739417, + "language_loss": 0.81327611, + "learning_rate": 9.901221010741407e-07, + "loss": 0.83519781, + "num_input_tokens_seen": 121358210, + "step": 5642, + "time_per_iteration": 2.7760441303253174 + }, + { + "auxiliary_loss_clip": 0.01128316, + "auxiliary_loss_mlp": 0.01084219, + "balance_loss_clip": 1.02677596, + "balance_loss_mlp": 1.00406229, + "epoch": 0.6785306318763903, + "flos": 32671923091200.0, + "grad_norm": 1.7276190240136997, + "language_loss": 0.747711, + "learning_rate": 9.894498043711375e-07, + "loss": 0.76983637, + "num_input_tokens_seen": 121379955, + "step": 5643, + "time_per_iteration": 2.8077831268310547 + }, + { + "auxiliary_loss_clip": 0.01116034, + "auxiliary_loss_mlp": 0.01083528, + "balance_loss_clip": 1.02417898, + "balance_loss_mlp": 1.00327682, + "epoch": 0.6786508747670293, + "flos": 25632139340160.0, + "grad_norm": 1.8879224360959808, + "language_loss": 0.69276714, + "learning_rate": 9.887776609677962e-07, + "loss": 0.71476275, + "num_input_tokens_seen": 121401325, + "step": 5644, + "time_per_iteration": 2.772841453552246 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.0108376, + "balance_loss_clip": 1.02368152, + "balance_loss_mlp": 1.00360322, + "epoch": 0.6787711176576685, + "flos": 19171881619200.0, + "grad_norm": 1.5754985771283647, + "language_loss": 0.72288561, + "learning_rate": 9.88105670966079e-07, + "loss": 0.74481249, + "num_input_tokens_seen": 121419785, + "step": 5645, + "time_per_iteration": 2.743443012237549 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_clip": 1.02200055, + "balance_loss_mlp": 1.00378776, + "epoch": 0.6788913605483076, + "flos": 13985159581440.0, + "grad_norm": 1.9583841257037646, + "language_loss": 0.78885198, + "learning_rate": 9.874338344679283e-07, + "loss": 0.81065285, + "num_input_tokens_seen": 121435630, + "step": 5646, + "time_per_iteration": 2.7824113368988037 + }, + { + "auxiliary_loss_clip": 0.01134678, + "auxiliary_loss_mlp": 0.01084183, + "balance_loss_clip": 1.02564979, + "balance_loss_mlp": 1.00402713, + "epoch": 0.6790116034389466, + "flos": 22017586659840.0, + "grad_norm": 1.57936444469701, + "language_loss": 0.73889875, + "learning_rate": 9.86762151575259e-07, + "loss": 0.76108742, + "num_input_tokens_seen": 121455625, + "step": 5647, + "time_per_iteration": 2.6363790035247803 + }, + { + "auxiliary_loss_clip": 0.0108102, + "auxiliary_loss_mlp": 0.00872782, + "balance_loss_clip": 1.02335095, + "balance_loss_mlp": 1.00012088, + "epoch": 0.6791318463295858, + "flos": 20922454851840.0, + "grad_norm": 1.4298438705744503, + "language_loss": 0.80303031, + "learning_rate": 9.860906223899651e-07, + "loss": 0.8225683, + "num_input_tokens_seen": 121475020, + "step": 5648, + "time_per_iteration": 2.814378261566162 + }, + { + "auxiliary_loss_clip": 0.01119689, + "auxiliary_loss_mlp": 0.01084184, + "balance_loss_clip": 1.02569032, + "balance_loss_mlp": 1.00407529, + "epoch": 0.6792520892202248, + "flos": 28512749422080.0, + "grad_norm": 1.6018859309189748, + "language_loss": 0.75652564, + "learning_rate": 9.854192470139184e-07, + "loss": 0.77856445, + "num_input_tokens_seen": 121496500, + "step": 5649, + "time_per_iteration": 2.758044481277466 + }, + { + "auxiliary_loss_clip": 0.01115551, + "auxiliary_loss_mlp": 0.01084515, + "balance_loss_clip": 1.02439785, + "balance_loss_mlp": 1.00435889, + "epoch": 0.6793723321108639, + "flos": 20011904058240.0, + "grad_norm": 2.1420717708969548, + "language_loss": 0.71553659, + "learning_rate": 9.847480255489645e-07, + "loss": 0.73753721, + "num_input_tokens_seen": 121515525, + "step": 5650, + "time_per_iteration": 2.725576877593994 + }, + { + "auxiliary_loss_clip": 0.0111813, + "auxiliary_loss_mlp": 0.01084512, + "balance_loss_clip": 1.02491689, + "balance_loss_mlp": 1.00435603, + "epoch": 0.6794925750015031, + "flos": 26649488246400.0, + "grad_norm": 1.6727048905981996, + "language_loss": 0.68905735, + "learning_rate": 9.840769580969295e-07, + "loss": 0.71108377, + "num_input_tokens_seen": 121535965, + "step": 5651, + "time_per_iteration": 2.720813035964966 + }, + { + "auxiliary_loss_clip": 0.01127409, + "auxiliary_loss_mlp": 0.01083965, + "balance_loss_clip": 1.02576828, + "balance_loss_mlp": 1.00376081, + "epoch": 0.6796128178921421, + "flos": 21580374314880.0, + "grad_norm": 2.0017509081728493, + "language_loss": 0.79945099, + "learning_rate": 9.834060447596114e-07, + "loss": 0.82156473, + "num_input_tokens_seen": 121555235, + "step": 5652, + "time_per_iteration": 2.692019462585449 + }, + { + "auxiliary_loss_clip": 0.01126497, + "auxiliary_loss_mlp": 0.01084928, + "balance_loss_clip": 1.02530169, + "balance_loss_mlp": 1.00472426, + "epoch": 0.6797330607827812, + "flos": 22492002516480.0, + "grad_norm": 1.6491566376227882, + "language_loss": 0.77885962, + "learning_rate": 9.827352856387868e-07, + "loss": 0.80097389, + "num_input_tokens_seen": 121574945, + "step": 5653, + "time_per_iteration": 2.6972858905792236 + }, + { + "auxiliary_loss_clip": 0.01081297, + "auxiliary_loss_mlp": 0.01079573, + "balance_loss_clip": 1.01881897, + "balance_loss_mlp": 1.00060928, + "epoch": 0.6798533036734203, + "flos": 66306648286080.0, + "grad_norm": 0.793728627463002, + "language_loss": 0.64291275, + "learning_rate": 9.820646808362118e-07, + "loss": 0.66452146, + "num_input_tokens_seen": 121641200, + "step": 5654, + "time_per_iteration": 3.393174886703491 + }, + { + "auxiliary_loss_clip": 0.0111457, + "auxiliary_loss_mlp": 0.01085207, + "balance_loss_clip": 1.02403522, + "balance_loss_mlp": 1.00500286, + "epoch": 0.6799735465640594, + "flos": 16180163792640.0, + "grad_norm": 2.3632812285862452, + "language_loss": 0.72601807, + "learning_rate": 9.813942304536154e-07, + "loss": 0.74801588, + "num_input_tokens_seen": 121659170, + "step": 5655, + "time_per_iteration": 2.824125051498413 + }, + { + "auxiliary_loss_clip": 0.0111704, + "auxiliary_loss_mlp": 0.01083258, + "balance_loss_clip": 1.02474761, + "balance_loss_mlp": 1.00305367, + "epoch": 0.6800937894546984, + "flos": 22125749489280.0, + "grad_norm": 5.579593742278804, + "language_loss": 0.63498473, + "learning_rate": 9.807239345927043e-07, + "loss": 0.65698767, + "num_input_tokens_seen": 121679180, + "step": 5656, + "time_per_iteration": 2.6831438541412354 + }, + { + "auxiliary_loss_clip": 0.01100998, + "auxiliary_loss_mlp": 0.01083807, + "balance_loss_clip": 1.02382791, + "balance_loss_mlp": 1.00360346, + "epoch": 0.6802140323453376, + "flos": 31612953300480.0, + "grad_norm": 2.0270155490573245, + "language_loss": 0.71938121, + "learning_rate": 9.80053793355162e-07, + "loss": 0.7412293, + "num_input_tokens_seen": 121697875, + "step": 5657, + "time_per_iteration": 3.7519891262054443 + }, + { + "auxiliary_loss_clip": 0.01089119, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.02131224, + "balance_loss_mlp": 1.00513673, + "epoch": 0.6803342752359767, + "flos": 17712938908800.0, + "grad_norm": 1.8214722697412753, + "language_loss": 0.74704504, + "learning_rate": 9.793838068426472e-07, + "loss": 0.76879013, + "num_input_tokens_seen": 121715570, + "step": 5658, + "time_per_iteration": 2.7558019161224365 + }, + { + "auxiliary_loss_clip": 0.01135806, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_clip": 1.02656019, + "balance_loss_mlp": 1.00413215, + "epoch": 0.6804545181266157, + "flos": 11326800902400.0, + "grad_norm": 2.02935077452436, + "language_loss": 0.61356503, + "learning_rate": 9.78713975156799e-07, + "loss": 0.63576692, + "num_input_tokens_seen": 121731435, + "step": 5659, + "time_per_iteration": 2.555532217025757 + }, + { + "auxiliary_loss_clip": 0.01103381, + "auxiliary_loss_mlp": 0.01084546, + "balance_loss_clip": 1.0218637, + "balance_loss_mlp": 1.00434208, + "epoch": 0.6805747610172549, + "flos": 29350976181120.0, + "grad_norm": 1.7374548728875925, + "language_loss": 0.72084117, + "learning_rate": 9.780442983992273e-07, + "loss": 0.74272048, + "num_input_tokens_seen": 121749950, + "step": 5660, + "time_per_iteration": 2.807788610458374 + }, + { + "auxiliary_loss_clip": 0.011174, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_clip": 1.02459812, + "balance_loss_mlp": 1.00466418, + "epoch": 0.680695003907894, + "flos": 37631868612480.0, + "grad_norm": 1.6335529850974535, + "language_loss": 0.71579742, + "learning_rate": 9.773747766715238e-07, + "loss": 0.73782063, + "num_input_tokens_seen": 121770770, + "step": 5661, + "time_per_iteration": 4.696304559707642 + }, + { + "auxiliary_loss_clip": 0.01117716, + "auxiliary_loss_mlp": 0.01084671, + "balance_loss_clip": 1.02423894, + "balance_loss_mlp": 1.00446677, + "epoch": 0.680815246798533, + "flos": 22127365601280.0, + "grad_norm": 4.783192629877633, + "language_loss": 0.80325544, + "learning_rate": 9.767054100752536e-07, + "loss": 0.8252793, + "num_input_tokens_seen": 121790720, + "step": 5662, + "time_per_iteration": 3.6354334354400635 + }, + { + "auxiliary_loss_clip": 0.01086868, + "auxiliary_loss_mlp": 0.01084137, + "balance_loss_clip": 1.02552986, + "balance_loss_mlp": 1.00388503, + "epoch": 0.6809354896891722, + "flos": 17201822330880.0, + "grad_norm": 1.9937365313421969, + "language_loss": 0.81784731, + "learning_rate": 9.760361987119584e-07, + "loss": 0.83955741, + "num_input_tokens_seen": 121808455, + "step": 5663, + "time_per_iteration": 2.764380693435669 + }, + { + "auxiliary_loss_clip": 0.01114516, + "auxiliary_loss_mlp": 0.01083847, + "balance_loss_clip": 1.02369928, + "balance_loss_mlp": 1.00354767, + "epoch": 0.6810557325798112, + "flos": 12458166554880.0, + "grad_norm": 3.3519502101846648, + "language_loss": 0.67756069, + "learning_rate": 9.753671426831592e-07, + "loss": 0.69954437, + "num_input_tokens_seen": 121824470, + "step": 5664, + "time_per_iteration": 2.680534601211548 + }, + { + "auxiliary_loss_clip": 0.01125298, + "auxiliary_loss_mlp": 0.01084451, + "balance_loss_clip": 1.02402806, + "balance_loss_mlp": 1.00419974, + "epoch": 0.6811759754704503, + "flos": 22156165330560.0, + "grad_norm": 2.0614785505771565, + "language_loss": 0.795663, + "learning_rate": 9.746982420903483e-07, + "loss": 0.81776053, + "num_input_tokens_seen": 121842665, + "step": 5665, + "time_per_iteration": 2.7029714584350586 + }, + { + "auxiliary_loss_clip": 0.0112138, + "auxiliary_loss_mlp": 0.01083842, + "balance_loss_clip": 1.02242923, + "balance_loss_mlp": 1.00378156, + "epoch": 0.6812962183610894, + "flos": 17525377065600.0, + "grad_norm": 2.4441484921105734, + "language_loss": 0.74802423, + "learning_rate": 9.740294970349993e-07, + "loss": 0.77007645, + "num_input_tokens_seen": 121859080, + "step": 5666, + "time_per_iteration": 2.6725947856903076 + }, + { + "auxiliary_loss_clip": 0.01100741, + "auxiliary_loss_mlp": 0.01079507, + "balance_loss_clip": 1.02164328, + "balance_loss_mlp": 1.00054264, + "epoch": 0.6814164612517285, + "flos": 60274480855680.0, + "grad_norm": 0.8913937896238966, + "language_loss": 0.6096046, + "learning_rate": 9.733609076185594e-07, + "loss": 0.63140714, + "num_input_tokens_seen": 121915485, + "step": 5667, + "time_per_iteration": 3.1395857334136963 + }, + { + "auxiliary_loss_clip": 0.01127009, + "auxiliary_loss_mlp": 0.01083044, + "balance_loss_clip": 1.0262394, + "balance_loss_mlp": 1.00288773, + "epoch": 0.6815367041423676, + "flos": 19317750750720.0, + "grad_norm": 6.40455842874306, + "language_loss": 0.840841, + "learning_rate": 9.72692473942455e-07, + "loss": 0.8629415, + "num_input_tokens_seen": 121932710, + "step": 5668, + "time_per_iteration": 2.6688714027404785 + }, + { + "auxiliary_loss_clip": 0.01091971, + "auxiliary_loss_mlp": 0.01085163, + "balance_loss_clip": 1.01927722, + "balance_loss_mlp": 1.00481617, + "epoch": 0.6816569470330067, + "flos": 22161696024960.0, + "grad_norm": 1.7120309286976398, + "language_loss": 0.77495623, + "learning_rate": 9.720241961080849e-07, + "loss": 0.79672754, + "num_input_tokens_seen": 121952025, + "step": 5669, + "time_per_iteration": 2.749136209487915 + }, + { + "auxiliary_loss_clip": 0.01134808, + "auxiliary_loss_mlp": 0.01084029, + "balance_loss_clip": 1.02485991, + "balance_loss_mlp": 1.00382543, + "epoch": 0.6817771899236458, + "flos": 41463501137280.0, + "grad_norm": 1.8295535029016663, + "language_loss": 0.73236638, + "learning_rate": 9.713560742168259e-07, + "loss": 0.75455475, + "num_input_tokens_seen": 121974650, + "step": 5670, + "time_per_iteration": 2.802563428878784 + }, + { + "auxiliary_loss_clip": 0.01091814, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.02504814, + "balance_loss_mlp": 1.00331116, + "epoch": 0.6818974328142848, + "flos": 21106138026240.0, + "grad_norm": 2.3964493643908775, + "language_loss": 0.71375036, + "learning_rate": 9.706881083700333e-07, + "loss": 0.73550218, + "num_input_tokens_seen": 121994335, + "step": 5671, + "time_per_iteration": 2.6905734539031982 + }, + { + "auxiliary_loss_clip": 0.0107645, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_clip": 1.01975393, + "balance_loss_mlp": 1.00478959, + "epoch": 0.682017675704924, + "flos": 20441897769600.0, + "grad_norm": 2.014505025120831, + "language_loss": 0.82819629, + "learning_rate": 9.700202986690357e-07, + "loss": 0.84981072, + "num_input_tokens_seen": 122012635, + "step": 5672, + "time_per_iteration": 2.84536075592041 + }, + { + "auxiliary_loss_clip": 0.01124124, + "auxiliary_loss_mlp": 0.00872976, + "balance_loss_clip": 1.02411485, + "balance_loss_mlp": 1.00010586, + "epoch": 0.682137918595563, + "flos": 20044438801920.0, + "grad_norm": 3.027594203473649, + "language_loss": 0.6641438, + "learning_rate": 9.693526452151413e-07, + "loss": 0.68411481, + "num_input_tokens_seen": 122031685, + "step": 5673, + "time_per_iteration": 2.6590065956115723 + }, + { + "auxiliary_loss_clip": 0.01110125, + "auxiliary_loss_mlp": 0.01084914, + "balance_loss_clip": 1.0245924, + "balance_loss_mlp": 1.00461423, + "epoch": 0.6822581614862021, + "flos": 31684559063040.0, + "grad_norm": 1.5450027039763343, + "language_loss": 0.7535578, + "learning_rate": 9.686851481096305e-07, + "loss": 0.77550822, + "num_input_tokens_seen": 122052995, + "step": 5674, + "time_per_iteration": 2.7767202854156494 + }, + { + "auxiliary_loss_clip": 0.01068252, + "auxiliary_loss_mlp": 0.01082916, + "balance_loss_clip": 1.01932573, + "balance_loss_mlp": 1.00275981, + "epoch": 0.6823784043768413, + "flos": 23477570864640.0, + "grad_norm": 1.7289801527544197, + "language_loss": 0.71531123, + "learning_rate": 9.68017807453762e-07, + "loss": 0.7368229, + "num_input_tokens_seen": 122071740, + "step": 5675, + "time_per_iteration": 2.903362989425659 + }, + { + "auxiliary_loss_clip": 0.01117817, + "auxiliary_loss_mlp": 0.00872795, + "balance_loss_clip": 1.02552617, + "balance_loss_mlp": 1.00014818, + "epoch": 0.6824986472674803, + "flos": 14137134024960.0, + "grad_norm": 1.9002296264134786, + "language_loss": 0.73267251, + "learning_rate": 9.673506233487721e-07, + "loss": 0.75257868, + "num_input_tokens_seen": 122089705, + "step": 5676, + "time_per_iteration": 2.7672159671783447 + }, + { + "auxiliary_loss_clip": 0.01116846, + "auxiliary_loss_mlp": 0.00872725, + "balance_loss_clip": 1.02412069, + "balance_loss_mlp": 1.00014722, + "epoch": 0.6826188901581194, + "flos": 21504997624320.0, + "grad_norm": 1.6077680939442622, + "language_loss": 0.86082423, + "learning_rate": 9.666835958958717e-07, + "loss": 0.88071996, + "num_input_tokens_seen": 122109025, + "step": 5677, + "time_per_iteration": 2.8408849239349365 + }, + { + "auxiliary_loss_clip": 0.01135731, + "auxiliary_loss_mlp": 0.01083911, + "balance_loss_clip": 1.02613664, + "balance_loss_mlp": 1.00375426, + "epoch": 0.6827391330487584, + "flos": 20810126044800.0, + "grad_norm": 1.8083283114155992, + "language_loss": 0.80322474, + "learning_rate": 9.660167251962484e-07, + "loss": 0.82542121, + "num_input_tokens_seen": 122127385, + "step": 5678, + "time_per_iteration": 2.6633284091949463 + }, + { + "auxiliary_loss_clip": 0.0110713, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_clip": 1.02339363, + "balance_loss_mlp": 1.00347471, + "epoch": 0.6828593759393976, + "flos": 21688788539520.0, + "grad_norm": 1.543361620188044, + "language_loss": 0.77744979, + "learning_rate": 9.653500113510654e-07, + "loss": 0.79935694, + "num_input_tokens_seen": 122146500, + "step": 5679, + "time_per_iteration": 2.8661048412323 + }, + { + "auxiliary_loss_clip": 0.01115926, + "auxiliary_loss_mlp": 0.01083229, + "balance_loss_clip": 1.02369845, + "balance_loss_mlp": 1.00307274, + "epoch": 0.6829796188300367, + "flos": 25337707557120.0, + "grad_norm": 2.4186022823410083, + "language_loss": 0.6673696, + "learning_rate": 9.646834544614627e-07, + "loss": 0.6893611, + "num_input_tokens_seen": 122167000, + "step": 5680, + "time_per_iteration": 2.7759077548980713 + }, + { + "auxiliary_loss_clip": 0.01113085, + "auxiliary_loss_mlp": 0.01083351, + "balance_loss_clip": 1.02602661, + "balance_loss_mlp": 1.00319505, + "epoch": 0.6830998617206757, + "flos": 20704800389760.0, + "grad_norm": 2.6657534746036418, + "language_loss": 0.76397145, + "learning_rate": 9.64017054628558e-07, + "loss": 0.78593576, + "num_input_tokens_seen": 122185825, + "step": 5681, + "time_per_iteration": 2.741234540939331 + }, + { + "auxiliary_loss_clip": 0.0109671, + "auxiliary_loss_mlp": 0.01083932, + "balance_loss_clip": 1.02177966, + "balance_loss_mlp": 1.00377607, + "epoch": 0.6832201046113149, + "flos": 21726638496000.0, + "grad_norm": 1.610948382010127, + "language_loss": 0.78760421, + "learning_rate": 9.63350811953441e-07, + "loss": 0.80941069, + "num_input_tokens_seen": 122206200, + "step": 5682, + "time_per_iteration": 2.8885464668273926 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.0108521, + "balance_loss_clip": 1.02363038, + "balance_loss_mlp": 1.00495803, + "epoch": 0.6833403475019539, + "flos": 19536554448000.0, + "grad_norm": 2.670036305429808, + "language_loss": 0.70902199, + "learning_rate": 9.626847265371826e-07, + "loss": 0.73094046, + "num_input_tokens_seen": 122225520, + "step": 5683, + "time_per_iteration": 3.5815224647521973 + }, + { + "auxiliary_loss_clip": 0.01114889, + "auxiliary_loss_mlp": 0.01082879, + "balance_loss_clip": 1.02261865, + "balance_loss_mlp": 1.0027225, + "epoch": 0.683460590392593, + "flos": 19352153001600.0, + "grad_norm": 2.1486326179626514, + "language_loss": 0.79074657, + "learning_rate": 9.620187984808262e-07, + "loss": 0.81272423, + "num_input_tokens_seen": 122244320, + "step": 5684, + "time_per_iteration": 2.8327853679656982 + }, + { + "auxiliary_loss_clip": 0.01109545, + "auxiliary_loss_mlp": 0.00872917, + "balance_loss_clip": 1.02352297, + "balance_loss_mlp": 1.00010753, + "epoch": 0.6835808332832322, + "flos": 23288500650240.0, + "grad_norm": 1.6529238685340768, + "language_loss": 0.8575778, + "learning_rate": 9.613530278853919e-07, + "loss": 0.87740242, + "num_input_tokens_seen": 122264295, + "step": 5685, + "time_per_iteration": 2.747295379638672 + }, + { + "auxiliary_loss_clip": 0.01120101, + "auxiliary_loss_mlp": 0.01084134, + "balance_loss_clip": 1.02104521, + "balance_loss_mlp": 1.00397754, + "epoch": 0.6837010761738712, + "flos": 21653416621440.0, + "grad_norm": 1.5963478379537286, + "language_loss": 0.74405926, + "learning_rate": 9.60687414851879e-07, + "loss": 0.7661016, + "num_input_tokens_seen": 122285300, + "step": 5686, + "time_per_iteration": 4.673339128494263 + }, + { + "auxiliary_loss_clip": 0.01094252, + "auxiliary_loss_mlp": 0.0108582, + "balance_loss_clip": 1.02448702, + "balance_loss_mlp": 1.00556839, + "epoch": 0.6838213190645103, + "flos": 17566387418880.0, + "grad_norm": 2.1988910700007356, + "language_loss": 0.76959944, + "learning_rate": 9.600219594812575e-07, + "loss": 0.79140019, + "num_input_tokens_seen": 122303240, + "step": 5687, + "time_per_iteration": 2.7633056640625 + }, + { + "auxiliary_loss_clip": 0.01135568, + "auxiliary_loss_mlp": 0.01085179, + "balance_loss_clip": 1.02597547, + "balance_loss_mlp": 1.00507033, + "epoch": 0.6839415619551494, + "flos": 23112538899840.0, + "grad_norm": 1.6546742242192174, + "language_loss": 0.72739261, + "learning_rate": 9.593566618744786e-07, + "loss": 0.74960005, + "num_input_tokens_seen": 122323390, + "step": 5688, + "time_per_iteration": 3.5797502994537354 + }, + { + "auxiliary_loss_clip": 0.01135248, + "auxiliary_loss_mlp": 0.0108417, + "balance_loss_clip": 1.0256598, + "balance_loss_mlp": 1.00401366, + "epoch": 0.6840618048457885, + "flos": 22127868391680.0, + "grad_norm": 1.6740366739187982, + "language_loss": 0.73480392, + "learning_rate": 9.58691522132466e-07, + "loss": 0.75699806, + "num_input_tokens_seen": 122342200, + "step": 5689, + "time_per_iteration": 2.604417085647583 + }, + { + "auxiliary_loss_clip": 0.0111573, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_clip": 1.02391624, + "balance_loss_mlp": 1.00393081, + "epoch": 0.6841820477364275, + "flos": 22015898720640.0, + "grad_norm": 2.197902593317415, + "language_loss": 0.84917367, + "learning_rate": 9.58026540356123e-07, + "loss": 0.87117231, + "num_input_tokens_seen": 122360465, + "step": 5690, + "time_per_iteration": 2.69077730178833 + }, + { + "auxiliary_loss_clip": 0.01127049, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_clip": 1.02584124, + "balance_loss_mlp": 1.00411451, + "epoch": 0.6843022906270667, + "flos": 24900531125760.0, + "grad_norm": 1.663522446335069, + "language_loss": 0.86755753, + "learning_rate": 9.573617166463246e-07, + "loss": 0.88967168, + "num_input_tokens_seen": 122381680, + "step": 5691, + "time_per_iteration": 2.66143798828125 + }, + { + "auxiliary_loss_clip": 0.01116028, + "auxiliary_loss_mlp": 0.01083794, + "balance_loss_clip": 1.02353442, + "balance_loss_mlp": 1.00363803, + "epoch": 0.6844225335177058, + "flos": 19969924037760.0, + "grad_norm": 1.834260460313177, + "language_loss": 0.59686232, + "learning_rate": 9.56697051103924e-07, + "loss": 0.61886054, + "num_input_tokens_seen": 122399120, + "step": 5692, + "time_per_iteration": 2.7621045112609863 + }, + { + "auxiliary_loss_clip": 0.01116169, + "auxiliary_loss_mlp": 0.01084096, + "balance_loss_clip": 1.02366209, + "balance_loss_mlp": 1.00403535, + "epoch": 0.6845427764083448, + "flos": 25883334126720.0, + "grad_norm": 2.0162869967136663, + "language_loss": 0.81005782, + "learning_rate": 9.560325438297522e-07, + "loss": 0.83206046, + "num_input_tokens_seen": 122417430, + "step": 5693, + "time_per_iteration": 2.7414674758911133 + }, + { + "auxiliary_loss_clip": 0.0111166, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_clip": 1.02144241, + "balance_loss_mlp": 1.0048902, + "epoch": 0.684663019298984, + "flos": 18880143356160.0, + "grad_norm": 1.8713889191565776, + "language_loss": 0.86854947, + "learning_rate": 9.553681949246127e-07, + "loss": 0.89051604, + "num_input_tokens_seen": 122435055, + "step": 5694, + "time_per_iteration": 2.7739200592041016 + }, + { + "auxiliary_loss_clip": 0.01102055, + "auxiliary_loss_mlp": 0.01085669, + "balance_loss_clip": 1.02341485, + "balance_loss_mlp": 1.00527465, + "epoch": 0.684783262189623, + "flos": 54193725302400.0, + "grad_norm": 1.7514760618763217, + "language_loss": 0.75605518, + "learning_rate": 9.547040044892886e-07, + "loss": 0.77793247, + "num_input_tokens_seen": 122462570, + "step": 5695, + "time_per_iteration": 2.991136312484741 + }, + { + "auxiliary_loss_clip": 0.01109587, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_clip": 1.02181542, + "balance_loss_mlp": 1.00022495, + "epoch": 0.6849035050802621, + "flos": 63970264143360.0, + "grad_norm": 0.8596181155142325, + "language_loss": 0.60176593, + "learning_rate": 9.540399726245354e-07, + "loss": 0.62365371, + "num_input_tokens_seen": 122519275, + "step": 5696, + "time_per_iteration": 3.1018950939178467 + }, + { + "auxiliary_loss_clip": 0.01118624, + "auxiliary_loss_mlp": 0.01083979, + "balance_loss_clip": 1.024997, + "balance_loss_mlp": 1.003775, + "epoch": 0.6850237479709013, + "flos": 25224121774080.0, + "grad_norm": 1.864865049129347, + "language_loss": 0.68860179, + "learning_rate": 9.533760994310859e-07, + "loss": 0.71062785, + "num_input_tokens_seen": 122539675, + "step": 5697, + "time_per_iteration": 2.774838447570801 + }, + { + "auxiliary_loss_clip": 0.01135301, + "auxiliary_loss_mlp": 0.01085465, + "balance_loss_clip": 1.02536488, + "balance_loss_mlp": 1.00530863, + "epoch": 0.6851439908615403, + "flos": 19354128249600.0, + "grad_norm": 1.7887595320144845, + "language_loss": 0.74771959, + "learning_rate": 9.527123850096508e-07, + "loss": 0.76992726, + "num_input_tokens_seen": 122558035, + "step": 5698, + "time_per_iteration": 2.714158773422241 + }, + { + "auxiliary_loss_clip": 0.01110212, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_clip": 1.02511132, + "balance_loss_mlp": 1.00413489, + "epoch": 0.6852642337521794, + "flos": 23182133500800.0, + "grad_norm": 2.0255382104324826, + "language_loss": 0.71545672, + "learning_rate": 9.520488294609142e-07, + "loss": 0.7374022, + "num_input_tokens_seen": 122576815, + "step": 5699, + "time_per_iteration": 2.6930105686187744 + }, + { + "auxiliary_loss_clip": 0.01084294, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_clip": 1.02204728, + "balance_loss_mlp": 1.00010526, + "epoch": 0.6853844766428185, + "flos": 62647206583680.0, + "grad_norm": 0.7381812075498949, + "language_loss": 0.53882921, + "learning_rate": 9.513854328855368e-07, + "loss": 0.56046283, + "num_input_tokens_seen": 122634690, + "step": 5700, + "time_per_iteration": 3.334878444671631 + }, + { + "auxiliary_loss_clip": 0.01133273, + "auxiliary_loss_mlp": 0.01084079, + "balance_loss_clip": 1.02391982, + "balance_loss_mlp": 1.00401855, + "epoch": 0.6855047195334576, + "flos": 23437242869760.0, + "grad_norm": 1.9078734887289646, + "language_loss": 0.81162262, + "learning_rate": 9.507221953841558e-07, + "loss": 0.83379614, + "num_input_tokens_seen": 122652320, + "step": 5701, + "time_per_iteration": 2.7209508419036865 + }, + { + "auxiliary_loss_clip": 0.01127384, + "auxiliary_loss_mlp": 0.01085026, + "balance_loss_clip": 1.02658868, + "balance_loss_mlp": 1.00477457, + "epoch": 0.6856249624240967, + "flos": 20664831530880.0, + "grad_norm": 1.485491413173715, + "language_loss": 0.77747029, + "learning_rate": 9.500591170573824e-07, + "loss": 0.7995944, + "num_input_tokens_seen": 122672340, + "step": 5702, + "time_per_iteration": 2.676431894302368 + }, + { + "auxiliary_loss_clip": 0.01094302, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_clip": 1.02102423, + "balance_loss_mlp": 1.0049293, + "epoch": 0.6857452053147358, + "flos": 17087302794240.0, + "grad_norm": 2.029106072978938, + "language_loss": 0.74199587, + "learning_rate": 9.493961980058078e-07, + "loss": 0.76379025, + "num_input_tokens_seen": 122689935, + "step": 5703, + "time_per_iteration": 2.7402853965759277 + }, + { + "auxiliary_loss_clip": 0.01089944, + "auxiliary_loss_mlp": 0.01083731, + "balance_loss_clip": 1.02262115, + "balance_loss_mlp": 1.0036701, + "epoch": 0.6858654482053749, + "flos": 30847266057600.0, + "grad_norm": 1.7115731811684873, + "language_loss": 0.67487741, + "learning_rate": 9.48733438329993e-07, + "loss": 0.69661415, + "num_input_tokens_seen": 122710200, + "step": 5704, + "time_per_iteration": 2.9153645038604736 + }, + { + "auxiliary_loss_clip": 0.01135841, + "auxiliary_loss_mlp": 0.00872864, + "balance_loss_clip": 1.02650142, + "balance_loss_mlp": 1.00015891, + "epoch": 0.6859856910960139, + "flos": 28877314510080.0, + "grad_norm": 1.6066820241594004, + "language_loss": 0.74547535, + "learning_rate": 9.480708381304807e-07, + "loss": 0.76556242, + "num_input_tokens_seen": 122731495, + "step": 5705, + "time_per_iteration": 2.697010040283203 + }, + { + "auxiliary_loss_clip": 0.01089498, + "auxiliary_loss_mlp": 0.01085055, + "balance_loss_clip": 1.02258778, + "balance_loss_mlp": 1.00489902, + "epoch": 0.6861059339866531, + "flos": 19354523299200.0, + "grad_norm": 1.9036091061115954, + "language_loss": 0.83652973, + "learning_rate": 9.474083975077858e-07, + "loss": 0.85827523, + "num_input_tokens_seen": 122748620, + "step": 5706, + "time_per_iteration": 2.7749736309051514 + }, + { + "auxiliary_loss_clip": 0.01127636, + "auxiliary_loss_mlp": 0.01083681, + "balance_loss_clip": 1.02572179, + "balance_loss_mlp": 1.00347686, + "epoch": 0.6862261768772921, + "flos": 22199976944640.0, + "grad_norm": 2.1244851788408643, + "language_loss": 0.79923093, + "learning_rate": 9.467461165623994e-07, + "loss": 0.82134414, + "num_input_tokens_seen": 122767670, + "step": 5707, + "time_per_iteration": 2.6874921321868896 + }, + { + "auxiliary_loss_clip": 0.01126904, + "auxiliary_loss_mlp": 0.01083594, + "balance_loss_clip": 1.0250535, + "balance_loss_mlp": 1.00348508, + "epoch": 0.6863464197679312, + "flos": 26285677344000.0, + "grad_norm": 1.9807740256135862, + "language_loss": 0.79590368, + "learning_rate": 9.46083995394791e-07, + "loss": 0.81800866, + "num_input_tokens_seen": 122785480, + "step": 5708, + "time_per_iteration": 3.5952389240264893 + }, + { + "auxiliary_loss_clip": 0.0112547, + "auxiliary_loss_mlp": 0.00872751, + "balance_loss_clip": 1.02450776, + "balance_loss_mlp": 1.0000633, + "epoch": 0.6864666626585703, + "flos": 37815228564480.0, + "grad_norm": 1.833002069688815, + "language_loss": 0.63385719, + "learning_rate": 9.454220341054012e-07, + "loss": 0.65383935, + "num_input_tokens_seen": 122810265, + "step": 5709, + "time_per_iteration": 2.805971622467041 + }, + { + "auxiliary_loss_clip": 0.01106035, + "auxiliary_loss_mlp": 0.01083305, + "balance_loss_clip": 1.02327001, + "balance_loss_mlp": 1.00319648, + "epoch": 0.6865869055492094, + "flos": 19391152193280.0, + "grad_norm": 1.8470738184582387, + "language_loss": 0.80384707, + "learning_rate": 9.447602327946512e-07, + "loss": 0.82574046, + "num_input_tokens_seen": 122828905, + "step": 5710, + "time_per_iteration": 2.7648682594299316 + }, + { + "auxiliary_loss_clip": 0.01118174, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_clip": 1.02384043, + "balance_loss_mlp": 1.00422978, + "epoch": 0.6867071484398485, + "flos": 20375966355840.0, + "grad_norm": 1.6902871748558155, + "language_loss": 0.76562428, + "learning_rate": 9.440985915629338e-07, + "loss": 0.78765178, + "num_input_tokens_seen": 122846235, + "step": 5711, + "time_per_iteration": 3.5929019451141357 + }, + { + "auxiliary_loss_clip": 0.01136566, + "auxiliary_loss_mlp": 0.01084298, + "balance_loss_clip": 1.02706909, + "balance_loss_mlp": 1.00418973, + "epoch": 0.6868273913304875, + "flos": 15889143801600.0, + "grad_norm": 1.9964649155845609, + "language_loss": 0.73405635, + "learning_rate": 9.434371105106223e-07, + "loss": 0.75626504, + "num_input_tokens_seen": 122863835, + "step": 5712, + "time_per_iteration": 3.526419162750244 + }, + { + "auxiliary_loss_clip": 0.01108148, + "auxiliary_loss_mlp": 0.01085134, + "balance_loss_clip": 1.02382481, + "balance_loss_mlp": 1.00488234, + "epoch": 0.6869476342211267, + "flos": 24462492768000.0, + "grad_norm": 1.8863151828639195, + "language_loss": 0.7080853, + "learning_rate": 9.427757897380602e-07, + "loss": 0.73001814, + "num_input_tokens_seen": 122883235, + "step": 5713, + "time_per_iteration": 3.9163265228271484 + }, + { + "auxiliary_loss_clip": 0.01103989, + "auxiliary_loss_mlp": 0.0108406, + "balance_loss_clip": 1.02251387, + "balance_loss_mlp": 1.0038085, + "epoch": 0.6870678771117658, + "flos": 18442571875200.0, + "grad_norm": 2.9021685771402446, + "language_loss": 0.85177416, + "learning_rate": 9.421146293455695e-07, + "loss": 0.8736546, + "num_input_tokens_seen": 122898975, + "step": 5714, + "time_per_iteration": 2.7807464599609375 + }, + { + "auxiliary_loss_clip": 0.01117538, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_clip": 1.02414966, + "balance_loss_mlp": 1.00390625, + "epoch": 0.6871881200024048, + "flos": 22200371994240.0, + "grad_norm": 1.7514669791702744, + "language_loss": 0.68709767, + "learning_rate": 9.414536294334489e-07, + "loss": 0.70911407, + "num_input_tokens_seen": 122918995, + "step": 5715, + "time_per_iteration": 2.6681888103485107 + }, + { + "auxiliary_loss_clip": 0.01119068, + "auxiliary_loss_mlp": 0.01083845, + "balance_loss_clip": 1.02469027, + "balance_loss_mlp": 1.00364137, + "epoch": 0.687308362893044, + "flos": 22127724737280.0, + "grad_norm": 1.8503238402913285, + "language_loss": 0.69846153, + "learning_rate": 9.407927901019708e-07, + "loss": 0.72049069, + "num_input_tokens_seen": 122938125, + "step": 5716, + "time_per_iteration": 2.767448902130127 + }, + { + "auxiliary_loss_clip": 0.0112601, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_clip": 1.02482939, + "balance_loss_mlp": 1.00441575, + "epoch": 0.687428605783683, + "flos": 25040546340480.0, + "grad_norm": 2.0398117935394966, + "language_loss": 0.7679559, + "learning_rate": 9.401321114513854e-07, + "loss": 0.79006076, + "num_input_tokens_seen": 122957020, + "step": 5717, + "time_per_iteration": 2.6810293197631836 + }, + { + "auxiliary_loss_clip": 0.01134626, + "auxiliary_loss_mlp": 0.01084777, + "balance_loss_clip": 1.02540445, + "balance_loss_mlp": 1.00452495, + "epoch": 0.6875488486743221, + "flos": 23770063313280.0, + "grad_norm": 1.500236097653947, + "language_loss": 0.7509129, + "learning_rate": 9.394715935819155e-07, + "loss": 0.77310693, + "num_input_tokens_seen": 122977410, + "step": 5718, + "time_per_iteration": 2.6959753036499023 + }, + { + "auxiliary_loss_clip": 0.01126905, + "auxiliary_loss_mlp": 0.01083995, + "balance_loss_clip": 1.02523601, + "balance_loss_mlp": 1.00379109, + "epoch": 0.6876690915649613, + "flos": 25516937445120.0, + "grad_norm": 1.861160699952656, + "language_loss": 0.62976336, + "learning_rate": 9.388112365937608e-07, + "loss": 0.65187234, + "num_input_tokens_seen": 122996875, + "step": 5719, + "time_per_iteration": 2.831916570663452 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_clip": 1.02364898, + "balance_loss_mlp": 1.00486815, + "epoch": 0.6877893344556003, + "flos": 19427996568960.0, + "grad_norm": 1.9198981399287114, + "language_loss": 0.82347393, + "learning_rate": 9.381510405870985e-07, + "loss": 0.8454085, + "num_input_tokens_seen": 123015890, + "step": 5720, + "time_per_iteration": 2.760532855987549 + }, + { + "auxiliary_loss_clip": 0.01127311, + "auxiliary_loss_mlp": 0.01085021, + "balance_loss_clip": 1.0252502, + "balance_loss_mlp": 1.00476933, + "epoch": 0.6879095773462394, + "flos": 18661303745280.0, + "grad_norm": 2.199516926918029, + "language_loss": 0.77312291, + "learning_rate": 9.374910056620791e-07, + "loss": 0.79524624, + "num_input_tokens_seen": 123034955, + "step": 5721, + "time_per_iteration": 2.674745798110962 + }, + { + "auxiliary_loss_clip": 0.01126939, + "auxiliary_loss_mlp": 0.01084966, + "balance_loss_clip": 1.02660489, + "balance_loss_mlp": 1.00466681, + "epoch": 0.6880298202368785, + "flos": 20883132437760.0, + "grad_norm": 1.7177484107113694, + "language_loss": 0.81042403, + "learning_rate": 9.368311319188293e-07, + "loss": 0.83254308, + "num_input_tokens_seen": 123052770, + "step": 5722, + "time_per_iteration": 2.662158489227295 + }, + { + "auxiliary_loss_clip": 0.01105502, + "auxiliary_loss_mlp": 0.01084116, + "balance_loss_clip": 1.02276754, + "balance_loss_mlp": 1.00391245, + "epoch": 0.6881500631275176, + "flos": 30153292318080.0, + "grad_norm": 1.6492870708438387, + "language_loss": 0.79208082, + "learning_rate": 9.361714194574515e-07, + "loss": 0.813977, + "num_input_tokens_seen": 123075105, + "step": 5723, + "time_per_iteration": 2.8124427795410156 + }, + { + "auxiliary_loss_clip": 0.01116641, + "auxiliary_loss_mlp": 0.01079373, + "balance_loss_clip": 1.02110457, + "balance_loss_mlp": 1.00040865, + "epoch": 0.6882703060181566, + "flos": 66181537215360.0, + "grad_norm": 0.7374104760796618, + "language_loss": 0.58302212, + "learning_rate": 9.355118683780228e-07, + "loss": 0.60498226, + "num_input_tokens_seen": 123145175, + "step": 5724, + "time_per_iteration": 3.298884153366089 + }, + { + "auxiliary_loss_clip": 0.01134946, + "auxiliary_loss_mlp": 0.01083313, + "balance_loss_clip": 1.02534652, + "balance_loss_mlp": 1.00310922, + "epoch": 0.6883905489087958, + "flos": 18214646123520.0, + "grad_norm": 1.9250625049112098, + "language_loss": 0.79610443, + "learning_rate": 9.348524787805987e-07, + "loss": 0.81828701, + "num_input_tokens_seen": 123160365, + "step": 5725, + "time_per_iteration": 2.5674238204956055 + }, + { + "auxiliary_loss_clip": 0.01108727, + "auxiliary_loss_mlp": 0.01084393, + "balance_loss_clip": 1.02361512, + "balance_loss_mlp": 1.00423717, + "epoch": 0.6885107917994349, + "flos": 14056262553600.0, + "grad_norm": 2.3329913336853005, + "language_loss": 0.84557366, + "learning_rate": 9.341932507652053e-07, + "loss": 0.86750489, + "num_input_tokens_seen": 123174855, + "step": 5726, + "time_per_iteration": 2.700704574584961 + }, + { + "auxiliary_loss_clip": 0.01118224, + "auxiliary_loss_mlp": 0.01084181, + "balance_loss_clip": 1.02430558, + "balance_loss_mlp": 1.00392926, + "epoch": 0.6886310346900739, + "flos": 28690722334080.0, + "grad_norm": 1.6702235021638976, + "language_loss": 0.78934699, + "learning_rate": 9.335341844318489e-07, + "loss": 0.81137109, + "num_input_tokens_seen": 123194995, + "step": 5727, + "time_per_iteration": 2.738722324371338 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.01084587, + "balance_loss_clip": 1.02237391, + "balance_loss_mlp": 1.00443053, + "epoch": 0.6887512775807131, + "flos": 24535319592960.0, + "grad_norm": 1.8995648220818881, + "language_loss": 0.73723149, + "learning_rate": 9.328752798805091e-07, + "loss": 0.75921559, + "num_input_tokens_seen": 123213465, + "step": 5728, + "time_per_iteration": 2.6940724849700928 + }, + { + "auxiliary_loss_clip": 0.01124982, + "auxiliary_loss_mlp": 0.01085248, + "balance_loss_clip": 1.02499139, + "balance_loss_mlp": 1.00504398, + "epoch": 0.6888715204713521, + "flos": 22414363269120.0, + "grad_norm": 1.8965343528001966, + "language_loss": 0.75829518, + "learning_rate": 9.322165372111399e-07, + "loss": 0.78039747, + "num_input_tokens_seen": 123231610, + "step": 5729, + "time_per_iteration": 2.6870815753936768 + }, + { + "auxiliary_loss_clip": 0.01104091, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_clip": 1.02249277, + "balance_loss_mlp": 1.00485754, + "epoch": 0.6889917633619912, + "flos": 22054323294720.0, + "grad_norm": 1.8100741576568538, + "language_loss": 0.75668395, + "learning_rate": 9.315579565236747e-07, + "loss": 0.778575, + "num_input_tokens_seen": 123250715, + "step": 5730, + "time_per_iteration": 2.821579694747925 + }, + { + "auxiliary_loss_clip": 0.01110259, + "auxiliary_loss_mlp": 0.0108521, + "balance_loss_clip": 1.02121425, + "balance_loss_mlp": 1.0050056, + "epoch": 0.6891120062526304, + "flos": 23949724164480.0, + "grad_norm": 1.6684991131632998, + "language_loss": 0.7389279, + "learning_rate": 9.308995379180162e-07, + "loss": 0.76088262, + "num_input_tokens_seen": 123270270, + "step": 5731, + "time_per_iteration": 2.722620725631714 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01079328, + "balance_loss_clip": 1.02111626, + "balance_loss_mlp": 1.00036418, + "epoch": 0.6892322491432694, + "flos": 64117354337280.0, + "grad_norm": 0.7358449495588781, + "language_loss": 0.5955615, + "learning_rate": 9.302412814940488e-07, + "loss": 0.61744142, + "num_input_tokens_seen": 123333045, + "step": 5732, + "time_per_iteration": 3.364981174468994 + }, + { + "auxiliary_loss_clip": 0.01119297, + "auxiliary_loss_mlp": 0.01084216, + "balance_loss_clip": 1.02547133, + "balance_loss_mlp": 1.00401211, + "epoch": 0.6893524920339085, + "flos": 23002436736000.0, + "grad_norm": 1.842153950424682, + "language_loss": 0.70915556, + "learning_rate": 9.295831873516276e-07, + "loss": 0.73119074, + "num_input_tokens_seen": 123352320, + "step": 5733, + "time_per_iteration": 2.656308174133301 + }, + { + "auxiliary_loss_clip": 0.0113626, + "auxiliary_loss_mlp": 0.01084153, + "balance_loss_clip": 1.02685571, + "balance_loss_mlp": 1.00404501, + "epoch": 0.6894727349245476, + "flos": 21396260177280.0, + "grad_norm": 1.5726773753531134, + "language_loss": 0.75942111, + "learning_rate": 9.289252555905873e-07, + "loss": 0.78162527, + "num_input_tokens_seen": 123372400, + "step": 5734, + "time_per_iteration": 3.512134075164795 + }, + { + "auxiliary_loss_clip": 0.0112531, + "auxiliary_loss_mlp": 0.01084641, + "balance_loss_clip": 1.02571547, + "balance_loss_mlp": 1.00443673, + "epoch": 0.6895929778151867, + "flos": 19865316654720.0, + "grad_norm": 2.0150128302940944, + "language_loss": 0.75973868, + "learning_rate": 9.282674863107334e-07, + "loss": 0.78183818, + "num_input_tokens_seen": 123390215, + "step": 5735, + "time_per_iteration": 2.667914390563965 + }, + { + "auxiliary_loss_clip": 0.0111993, + "auxiliary_loss_mlp": 0.01085373, + "balance_loss_clip": 1.02538157, + "balance_loss_mlp": 1.00526428, + "epoch": 0.6897132207058257, + "flos": 18179166464640.0, + "grad_norm": 2.038754016875984, + "language_loss": 0.75658834, + "learning_rate": 9.276098796118488e-07, + "loss": 0.77864134, + "num_input_tokens_seen": 123406700, + "step": 5736, + "time_per_iteration": 2.653608798980713 + }, + { + "auxiliary_loss_clip": 0.01116343, + "auxiliary_loss_mlp": 0.01083722, + "balance_loss_clip": 1.02442718, + "balance_loss_mlp": 1.00356543, + "epoch": 0.6898334635964649, + "flos": 32561641359360.0, + "grad_norm": 1.859900799264365, + "language_loss": 0.66272479, + "learning_rate": 9.269524355936938e-07, + "loss": 0.68472546, + "num_input_tokens_seen": 123429880, + "step": 5737, + "time_per_iteration": 4.61829686164856 + }, + { + "auxiliary_loss_clip": 0.01115227, + "auxiliary_loss_mlp": 0.0108536, + "balance_loss_clip": 1.02273142, + "balance_loss_mlp": 1.0051558, + "epoch": 0.689953706487104, + "flos": 22819004956800.0, + "grad_norm": 1.6122816263868103, + "language_loss": 0.84903115, + "learning_rate": 9.262951543560002e-07, + "loss": 0.87103701, + "num_input_tokens_seen": 123449105, + "step": 5738, + "time_per_iteration": 3.5589234828948975 + }, + { + "auxiliary_loss_clip": 0.01110323, + "auxiliary_loss_mlp": 0.01083447, + "balance_loss_clip": 1.02093303, + "balance_loss_mlp": 1.00309968, + "epoch": 0.690073949377743, + "flos": 18515362786560.0, + "grad_norm": 3.0961013571753218, + "language_loss": 0.86260366, + "learning_rate": 9.256380359984795e-07, + "loss": 0.88454133, + "num_input_tokens_seen": 123466215, + "step": 5739, + "time_per_iteration": 2.6479909420013428 + }, + { + "auxiliary_loss_clip": 0.01100764, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_clip": 1.02326059, + "balance_loss_mlp": 1.00317335, + "epoch": 0.6901941922683821, + "flos": 34857194716800.0, + "grad_norm": 1.7996589590577068, + "language_loss": 0.74953777, + "learning_rate": 9.249810806208139e-07, + "loss": 0.77137864, + "num_input_tokens_seen": 123485480, + "step": 5740, + "time_per_iteration": 2.9552671909332275 + }, + { + "auxiliary_loss_clip": 0.01099342, + "auxiliary_loss_mlp": 0.00872844, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.00013828, + "epoch": 0.6903144351590212, + "flos": 16253672976000.0, + "grad_norm": 1.8582862528031938, + "language_loss": 0.80346107, + "learning_rate": 9.243242883226627e-07, + "loss": 0.82318288, + "num_input_tokens_seen": 123504575, + "step": 5741, + "time_per_iteration": 2.83909010887146 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_clip": 1.02296376, + "balance_loss_mlp": 1.0034517, + "epoch": 0.6904346780496603, + "flos": 28035137255040.0, + "grad_norm": 1.9447499296128525, + "language_loss": 0.69628835, + "learning_rate": 9.236676592036628e-07, + "loss": 0.7183727, + "num_input_tokens_seen": 123524250, + "step": 5742, + "time_per_iteration": 2.7654929161071777 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01084306, + "balance_loss_clip": 1.02540278, + "balance_loss_mlp": 1.00419736, + "epoch": 0.6905549209402994, + "flos": 23624266008960.0, + "grad_norm": 1.7457045298319935, + "language_loss": 0.7386325, + "learning_rate": 9.230111933634228e-07, + "loss": 0.76063728, + "num_input_tokens_seen": 123545845, + "step": 5743, + "time_per_iteration": 2.807365655899048 + }, + { + "auxiliary_loss_clip": 0.01125848, + "auxiliary_loss_mlp": 0.01084087, + "balance_loss_clip": 1.02513838, + "balance_loss_mlp": 1.00393069, + "epoch": 0.6906751638309385, + "flos": 23114945111040.0, + "grad_norm": 1.6746805797890516, + "language_loss": 0.80752325, + "learning_rate": 9.223548909015288e-07, + "loss": 0.82962263, + "num_input_tokens_seen": 123567535, + "step": 5744, + "time_per_iteration": 2.709580898284912 + }, + { + "auxiliary_loss_clip": 0.01096554, + "auxiliary_loss_mlp": 0.01084201, + "balance_loss_clip": 1.02230644, + "balance_loss_mlp": 1.00409269, + "epoch": 0.6907954067215776, + "flos": 27305468375040.0, + "grad_norm": 1.7969654324005575, + "language_loss": 0.72042835, + "learning_rate": 9.216987519175407e-07, + "loss": 0.7422359, + "num_input_tokens_seen": 123587710, + "step": 5745, + "time_per_iteration": 2.8450541496276855 + }, + { + "auxiliary_loss_clip": 0.01118808, + "auxiliary_loss_mlp": 0.01083197, + "balance_loss_clip": 1.02463078, + "balance_loss_mlp": 1.00313616, + "epoch": 0.6909156496122166, + "flos": 21689399070720.0, + "grad_norm": 1.8693938837229593, + "language_loss": 0.68350267, + "learning_rate": 9.210427765109942e-07, + "loss": 0.70552278, + "num_input_tokens_seen": 123607385, + "step": 5746, + "time_per_iteration": 2.6284828186035156 + }, + { + "auxiliary_loss_clip": 0.01110787, + "auxiliary_loss_mlp": 0.01084986, + "balance_loss_clip": 1.02357352, + "balance_loss_mlp": 1.00463855, + "epoch": 0.6910358925028558, + "flos": 22561453463040.0, + "grad_norm": 1.9611361924347868, + "language_loss": 0.81574887, + "learning_rate": 9.20386964781402e-07, + "loss": 0.83770657, + "num_input_tokens_seen": 123625405, + "step": 5747, + "time_per_iteration": 2.739431142807007 + }, + { + "auxiliary_loss_clip": 0.01117839, + "auxiliary_loss_mlp": 0.01084653, + "balance_loss_clip": 1.02449846, + "balance_loss_mlp": 1.00454426, + "epoch": 0.6911561353934949, + "flos": 22054107813120.0, + "grad_norm": 2.0089134782659266, + "language_loss": 0.84813714, + "learning_rate": 9.197313168282472e-07, + "loss": 0.87016201, + "num_input_tokens_seen": 123642850, + "step": 5748, + "time_per_iteration": 2.690901279449463 + }, + { + "auxiliary_loss_clip": 0.01125378, + "auxiliary_loss_mlp": 0.0108447, + "balance_loss_clip": 1.02355981, + "balance_loss_mlp": 1.00421858, + "epoch": 0.6912763782841339, + "flos": 24206557386240.0, + "grad_norm": 1.9099687966868595, + "language_loss": 0.72482908, + "learning_rate": 9.190758327509935e-07, + "loss": 0.7469275, + "num_input_tokens_seen": 123661595, + "step": 5749, + "time_per_iteration": 2.723228931427002 + }, + { + "auxiliary_loss_clip": 0.01067258, + "auxiliary_loss_mlp": 0.0087303, + "balance_loss_clip": 1.02089667, + "balance_loss_mlp": 1.00152481, + "epoch": 0.6913966211747731, + "flos": 52329641091840.0, + "grad_norm": 0.9239101581453756, + "language_loss": 0.64458901, + "learning_rate": 9.184205126490767e-07, + "loss": 0.66399187, + "num_input_tokens_seen": 123710490, + "step": 5750, + "time_per_iteration": 3.1694564819335938 + }, + { + "auxiliary_loss_clip": 0.0109164, + "auxiliary_loss_mlp": 0.00873072, + "balance_loss_clip": 1.02072144, + "balance_loss_mlp": 1.00154305, + "epoch": 0.6915168640654121, + "flos": 66741274851840.0, + "grad_norm": 1.1365974577183682, + "language_loss": 0.59702516, + "learning_rate": 9.177653566219075e-07, + "loss": 0.61667228, + "num_input_tokens_seen": 123765215, + "step": 5751, + "time_per_iteration": 3.1962060928344727 + }, + { + "auxiliary_loss_clip": 0.01108835, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_clip": 1.02356887, + "balance_loss_mlp": 1.00385737, + "epoch": 0.6916371069560512, + "flos": 18296523175680.0, + "grad_norm": 1.9775995054461366, + "language_loss": 0.7608521, + "learning_rate": 9.171103647688744e-07, + "loss": 0.78278106, + "num_input_tokens_seen": 123783955, + "step": 5752, + "time_per_iteration": 2.823302745819092 + }, + { + "auxiliary_loss_clip": 0.01072992, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_clip": 1.02166307, + "balance_loss_mlp": 1.00395715, + "epoch": 0.6917573498466904, + "flos": 19645794685440.0, + "grad_norm": 1.7162439400932579, + "language_loss": 0.6895324, + "learning_rate": 9.164555371893367e-07, + "loss": 0.71110249, + "num_input_tokens_seen": 123803885, + "step": 5753, + "time_per_iteration": 2.846768617630005 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.00872798, + "balance_loss_clip": 1.02474904, + "balance_loss_mlp": 1.00010812, + "epoch": 0.6918775927373294, + "flos": 14210319985920.0, + "grad_norm": 1.776552133228094, + "language_loss": 0.7535904, + "learning_rate": 9.158008739826333e-07, + "loss": 0.77356613, + "num_input_tokens_seen": 123821485, + "step": 5754, + "time_per_iteration": 2.6755688190460205 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.01083718, + "balance_loss_clip": 1.02320671, + "balance_loss_mlp": 1.00356197, + "epoch": 0.6919978356279685, + "flos": 23985455218560.0, + "grad_norm": 1.790003730119761, + "language_loss": 0.8657124, + "learning_rate": 9.151463752480744e-07, + "loss": 0.88769174, + "num_input_tokens_seen": 123840215, + "step": 5755, + "time_per_iteration": 2.7150020599365234 + }, + { + "auxiliary_loss_clip": 0.01099555, + "auxiliary_loss_mlp": 0.0108402, + "balance_loss_clip": 1.02268124, + "balance_loss_mlp": 1.00386345, + "epoch": 0.6921180785186076, + "flos": 23622937205760.0, + "grad_norm": 1.4959702719089065, + "language_loss": 0.80181479, + "learning_rate": 9.144920410849493e-07, + "loss": 0.8236506, + "num_input_tokens_seen": 123861450, + "step": 5756, + "time_per_iteration": 2.8443098068237305 + }, + { + "auxiliary_loss_clip": 0.0111799, + "auxiliary_loss_mlp": 0.01084792, + "balance_loss_clip": 1.02476192, + "balance_loss_mlp": 1.00468326, + "epoch": 0.6922383214092467, + "flos": 21142623265920.0, + "grad_norm": 1.639398159083213, + "language_loss": 0.8043046, + "learning_rate": 9.138378715925176e-07, + "loss": 0.82633239, + "num_input_tokens_seen": 123880545, + "step": 5757, + "time_per_iteration": 2.904968500137329 + }, + { + "auxiliary_loss_clip": 0.01115321, + "auxiliary_loss_mlp": 0.01083821, + "balance_loss_clip": 1.02356982, + "balance_loss_mlp": 1.00371206, + "epoch": 0.6923585642998857, + "flos": 21470667200640.0, + "grad_norm": 1.5707782035472029, + "language_loss": 0.80702233, + "learning_rate": 9.131838668700167e-07, + "loss": 0.82901371, + "num_input_tokens_seen": 123900615, + "step": 5758, + "time_per_iteration": 2.732490301132202 + }, + { + "auxiliary_loss_clip": 0.01110517, + "auxiliary_loss_mlp": 0.0108463, + "balance_loss_clip": 1.02474403, + "balance_loss_mlp": 1.004426, + "epoch": 0.6924788071905249, + "flos": 21105204272640.0, + "grad_norm": 1.725575017654541, + "language_loss": 0.8642953, + "learning_rate": 9.125300270166598e-07, + "loss": 0.8862468, + "num_input_tokens_seen": 123921220, + "step": 5759, + "time_per_iteration": 3.6365644931793213 + }, + { + "auxiliary_loss_clip": 0.01091745, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.02316511, + "balance_loss_mlp": 1.00340688, + "epoch": 0.692599050081164, + "flos": 26250018117120.0, + "grad_norm": 1.8070464282009384, + "language_loss": 0.85950553, + "learning_rate": 9.118763521316324e-07, + "loss": 0.88125861, + "num_input_tokens_seen": 123941795, + "step": 5760, + "time_per_iteration": 2.7779557704925537 + }, + { + "auxiliary_loss_clip": 0.01134632, + "auxiliary_loss_mlp": 0.00872898, + "balance_loss_clip": 1.02470422, + "balance_loss_mlp": 1.0000639, + "epoch": 0.692719292971803, + "flos": 20885215426560.0, + "grad_norm": 1.536305991838895, + "language_loss": 0.7588259, + "learning_rate": 9.112228423140987e-07, + "loss": 0.77890122, + "num_input_tokens_seen": 123960715, + "step": 5761, + "time_per_iteration": 2.635158061981201 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_clip": 1.02379918, + "balance_loss_mlp": 1.00456333, + "epoch": 0.6928395358624422, + "flos": 25921938268800.0, + "grad_norm": 2.2293658208616423, + "language_loss": 0.86479753, + "learning_rate": 9.105694976631932e-07, + "loss": 0.88681215, + "num_input_tokens_seen": 123978625, + "step": 5762, + "time_per_iteration": 2.7058591842651367 + }, + { + "auxiliary_loss_clip": 0.01125674, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_clip": 1.02550602, + "balance_loss_mlp": 1.0036186, + "epoch": 0.6929597787530812, + "flos": 23586559706880.0, + "grad_norm": 1.9285360067775916, + "language_loss": 0.72550774, + "learning_rate": 9.099163182780283e-07, + "loss": 0.74760222, + "num_input_tokens_seen": 123996780, + "step": 5763, + "time_per_iteration": 5.416718244552612 + }, + { + "auxiliary_loss_clip": 0.011083, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_clip": 1.02350605, + "balance_loss_mlp": 1.00414228, + "epoch": 0.6930800216437203, + "flos": 18255656476800.0, + "grad_norm": 2.244030862265421, + "language_loss": 0.48716742, + "learning_rate": 9.092633042576916e-07, + "loss": 0.5090943, + "num_input_tokens_seen": 124014045, + "step": 5764, + "time_per_iteration": 2.7043228149414062 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.01084304, + "balance_loss_clip": 1.02365661, + "balance_loss_mlp": 1.00414777, + "epoch": 0.6932002645343595, + "flos": 29168621809920.0, + "grad_norm": 1.6496901098965169, + "language_loss": 0.55817723, + "learning_rate": 9.086104557012446e-07, + "loss": 0.5801599, + "num_input_tokens_seen": 124034615, + "step": 5765, + "time_per_iteration": 2.7635574340820312 + }, + { + "auxiliary_loss_clip": 0.01125764, + "auxiliary_loss_mlp": 0.01085598, + "balance_loss_clip": 1.02430558, + "balance_loss_mlp": 1.00548959, + "epoch": 0.6933205074249985, + "flos": 23842746483840.0, + "grad_norm": 1.936165712384101, + "language_loss": 0.65499955, + "learning_rate": 9.079577727077239e-07, + "loss": 0.67711318, + "num_input_tokens_seen": 124053445, + "step": 5766, + "time_per_iteration": 2.689750909805298 + }, + { + "auxiliary_loss_clip": 0.01125387, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_clip": 1.02503741, + "balance_loss_mlp": 1.00421476, + "epoch": 0.6934407503156376, + "flos": 24166696268160.0, + "grad_norm": 2.3702643864423227, + "language_loss": 0.7181251, + "learning_rate": 9.073052553761404e-07, + "loss": 0.74022365, + "num_input_tokens_seen": 124072810, + "step": 5767, + "time_per_iteration": 2.712674379348755 + }, + { + "auxiliary_loss_clip": 0.01098672, + "auxiliary_loss_mlp": 0.01085019, + "balance_loss_clip": 1.02333057, + "balance_loss_mlp": 1.00481462, + "epoch": 0.6935609932062767, + "flos": 20631327120000.0, + "grad_norm": 1.562742619203162, + "language_loss": 0.7816149, + "learning_rate": 9.066529038054805e-07, + "loss": 0.80345178, + "num_input_tokens_seen": 124092875, + "step": 5768, + "time_per_iteration": 2.739051580429077 + }, + { + "auxiliary_loss_clip": 0.01116765, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_clip": 1.0244174, + "balance_loss_mlp": 1.00360537, + "epoch": 0.6936812360969158, + "flos": 18254184019200.0, + "grad_norm": 1.688103778877924, + "language_loss": 0.73858285, + "learning_rate": 9.060007180947071e-07, + "loss": 0.76058805, + "num_input_tokens_seen": 124110930, + "step": 5769, + "time_per_iteration": 2.6552066802978516 + }, + { + "auxiliary_loss_clip": 0.01083295, + "auxiliary_loss_mlp": 0.01083764, + "balance_loss_clip": 1.02361596, + "balance_loss_mlp": 1.00351262, + "epoch": 0.6938014789875548, + "flos": 31317336368640.0, + "grad_norm": 1.9905588219329775, + "language_loss": 0.73074186, + "learning_rate": 9.053486983427534e-07, + "loss": 0.75241244, + "num_input_tokens_seen": 124132180, + "step": 5770, + "time_per_iteration": 2.8825266361236572 + }, + { + "auxiliary_loss_clip": 0.01116035, + "auxiliary_loss_mlp": 0.01084628, + "balance_loss_clip": 1.02301812, + "balance_loss_mlp": 1.00442386, + "epoch": 0.6939217218781939, + "flos": 17528429721600.0, + "grad_norm": 1.8595424428376837, + "language_loss": 0.70225394, + "learning_rate": 9.046968446485326e-07, + "loss": 0.72426057, + "num_input_tokens_seen": 124150585, + "step": 5771, + "time_per_iteration": 2.734373092651367 + }, + { + "auxiliary_loss_clip": 0.01125582, + "auxiliary_loss_mlp": 0.01083519, + "balance_loss_clip": 1.02490854, + "balance_loss_mlp": 1.00326741, + "epoch": 0.6940419647688331, + "flos": 18551776199040.0, + "grad_norm": 2.203413690945253, + "language_loss": 0.70569324, + "learning_rate": 9.040451571109295e-07, + "loss": 0.72778428, + "num_input_tokens_seen": 124166205, + "step": 5772, + "time_per_iteration": 2.5923757553100586 + }, + { + "auxiliary_loss_clip": 0.01082354, + "auxiliary_loss_mlp": 0.01079541, + "balance_loss_clip": 1.01154685, + "balance_loss_mlp": 1.00057638, + "epoch": 0.6941622076594721, + "flos": 66926286829440.0, + "grad_norm": 0.8265154750706414, + "language_loss": 0.60391915, + "learning_rate": 9.033936358288042e-07, + "loss": 0.62553811, + "num_input_tokens_seen": 124219940, + "step": 5773, + "time_per_iteration": 3.199712038040161 + }, + { + "auxiliary_loss_clip": 0.01134491, + "auxiliary_loss_mlp": 0.01083726, + "balance_loss_clip": 1.02497721, + "balance_loss_mlp": 1.00357008, + "epoch": 0.6942824505501112, + "flos": 26578062051840.0, + "grad_norm": 1.8687117316061987, + "language_loss": 0.82240355, + "learning_rate": 9.027422809009937e-07, + "loss": 0.84458572, + "num_input_tokens_seen": 124239885, + "step": 5774, + "time_per_iteration": 2.64913272857666 + }, + { + "auxiliary_loss_clip": 0.01126215, + "auxiliary_loss_mlp": 0.01084564, + "balance_loss_clip": 1.02452087, + "balance_loss_mlp": 1.00426471, + "epoch": 0.6944026934407503, + "flos": 21248308056960.0, + "grad_norm": 1.8310446352459884, + "language_loss": 0.83612955, + "learning_rate": 9.020910924263054e-07, + "loss": 0.85823739, + "num_input_tokens_seen": 124258410, + "step": 5775, + "time_per_iteration": 2.771888494491577 + }, + { + "auxiliary_loss_clip": 0.01081512, + "auxiliary_loss_mlp": 0.01079914, + "balance_loss_clip": 1.01100636, + "balance_loss_mlp": 1.00094986, + "epoch": 0.6945229363313894, + "flos": 70677191537280.0, + "grad_norm": 0.8297082097196408, + "language_loss": 0.58187044, + "learning_rate": 9.014400705035261e-07, + "loss": 0.60348475, + "num_input_tokens_seen": 124315315, + "step": 5776, + "time_per_iteration": 3.337813138961792 + }, + { + "auxiliary_loss_clip": 0.01135678, + "auxiliary_loss_mlp": 0.01084441, + "balance_loss_clip": 1.02661693, + "balance_loss_mlp": 1.00423694, + "epoch": 0.6946431792220285, + "flos": 18952934267520.0, + "grad_norm": 1.9190880458339292, + "language_loss": 0.76481342, + "learning_rate": 9.00789215231414e-07, + "loss": 0.7870146, + "num_input_tokens_seen": 124333710, + "step": 5777, + "time_per_iteration": 2.6112728118896484 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.00872848, + "balance_loss_clip": 1.02378249, + "balance_loss_mlp": 1.00014377, + "epoch": 0.6947634221126676, + "flos": 20338834671360.0, + "grad_norm": 1.6752943936967124, + "language_loss": 0.82057559, + "learning_rate": 9.001385267087056e-07, + "loss": 0.84040248, + "num_input_tokens_seen": 124352855, + "step": 5778, + "time_per_iteration": 2.7837815284729004 + }, + { + "auxiliary_loss_clip": 0.01126175, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_clip": 1.02478731, + "balance_loss_mlp": 1.00342631, + "epoch": 0.6948836650033067, + "flos": 21833723917440.0, + "grad_norm": 1.4524425375601209, + "language_loss": 0.70354545, + "learning_rate": 8.994880050341072e-07, + "loss": 0.72564256, + "num_input_tokens_seen": 124372960, + "step": 5779, + "time_per_iteration": 2.6959712505340576 + }, + { + "auxiliary_loss_clip": 0.01120269, + "auxiliary_loss_mlp": 0.0108517, + "balance_loss_clip": 1.02662933, + "balance_loss_mlp": 1.00496602, + "epoch": 0.6950039078939457, + "flos": 23657519024640.0, + "grad_norm": 2.0459519592718896, + "language_loss": 0.77537429, + "learning_rate": 8.988376503063026e-07, + "loss": 0.79742873, + "num_input_tokens_seen": 124394220, + "step": 5780, + "time_per_iteration": 2.737915277481079 + }, + { + "auxiliary_loss_clip": 0.01097043, + "auxiliary_loss_mlp": 0.01084597, + "balance_loss_clip": 1.02217901, + "balance_loss_mlp": 1.00439346, + "epoch": 0.6951241507845849, + "flos": 21792462168960.0, + "grad_norm": 1.787389039480526, + "language_loss": 0.8179161, + "learning_rate": 8.981874626239521e-07, + "loss": 0.83973253, + "num_input_tokens_seen": 124412795, + "step": 5781, + "time_per_iteration": 2.7513644695281982 + }, + { + "auxiliary_loss_clip": 0.01125926, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_clip": 1.02521086, + "balance_loss_mlp": 1.00472271, + "epoch": 0.695244393675224, + "flos": 14647568244480.0, + "grad_norm": 2.4038979467433803, + "language_loss": 0.88118911, + "learning_rate": 8.975374420856872e-07, + "loss": 0.9032976, + "num_input_tokens_seen": 124429690, + "step": 5782, + "time_per_iteration": 2.7013301849365234 + }, + { + "auxiliary_loss_clip": 0.01107999, + "auxiliary_loss_mlp": 0.01084808, + "balance_loss_clip": 1.02342057, + "balance_loss_mlp": 1.00469935, + "epoch": 0.695364636565863, + "flos": 16873203778560.0, + "grad_norm": 2.1237032552594055, + "language_loss": 0.72690749, + "learning_rate": 8.968875887901157e-07, + "loss": 0.74883556, + "num_input_tokens_seen": 124447070, + "step": 5783, + "time_per_iteration": 2.749812364578247 + }, + { + "auxiliary_loss_clip": 0.01118186, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_clip": 1.02485132, + "balance_loss_mlp": 1.00476646, + "epoch": 0.6954848794565022, + "flos": 19354523299200.0, + "grad_norm": 2.3645714068353967, + "language_loss": 0.6279375, + "learning_rate": 8.9623790283582e-07, + "loss": 0.64996904, + "num_input_tokens_seen": 124464950, + "step": 5784, + "time_per_iteration": 2.688356876373291 + }, + { + "auxiliary_loss_clip": 0.01107217, + "auxiliary_loss_mlp": 0.01085409, + "balance_loss_clip": 1.02323282, + "balance_loss_mlp": 1.00506234, + "epoch": 0.6956051223471412, + "flos": 18990209606400.0, + "grad_norm": 1.980043148136448, + "language_loss": 0.76144892, + "learning_rate": 8.955883843213561e-07, + "loss": 0.7833752, + "num_input_tokens_seen": 124483965, + "step": 5785, + "time_per_iteration": 3.7506332397460938 + }, + { + "auxiliary_loss_clip": 0.01125841, + "auxiliary_loss_mlp": 0.01083769, + "balance_loss_clip": 1.02413487, + "balance_loss_mlp": 1.00351703, + "epoch": 0.6957253652377803, + "flos": 16107229226880.0, + "grad_norm": 2.1021214790624763, + "language_loss": 0.86883008, + "learning_rate": 8.949390333452569e-07, + "loss": 0.89092612, + "num_input_tokens_seen": 124501910, + "step": 5786, + "time_per_iteration": 2.5960824489593506 + }, + { + "auxiliary_loss_clip": 0.01136249, + "auxiliary_loss_mlp": 0.01083726, + "balance_loss_clip": 1.02709174, + "balance_loss_mlp": 1.00361776, + "epoch": 0.6958456081284194, + "flos": 29388646569600.0, + "grad_norm": 1.7349598262952919, + "language_loss": 0.67767024, + "learning_rate": 8.942898500060279e-07, + "loss": 0.69986999, + "num_input_tokens_seen": 124521625, + "step": 5787, + "time_per_iteration": 2.676942825317383 + }, + { + "auxiliary_loss_clip": 0.01081796, + "auxiliary_loss_mlp": 0.0108567, + "balance_loss_clip": 1.02350307, + "balance_loss_mlp": 1.00541854, + "epoch": 0.6959658510190585, + "flos": 25154850395520.0, + "grad_norm": 3.454649359736964, + "language_loss": 0.71953821, + "learning_rate": 8.936408344021493e-07, + "loss": 0.7412129, + "num_input_tokens_seen": 124538540, + "step": 5788, + "time_per_iteration": 4.650408506393433 + }, + { + "auxiliary_loss_clip": 0.01102019, + "auxiliary_loss_mlp": 0.01084926, + "balance_loss_clip": 1.02590561, + "balance_loss_mlp": 1.00467432, + "epoch": 0.6960860939096976, + "flos": 42814388759040.0, + "grad_norm": 1.9357536512477644, + "language_loss": 0.71056503, + "learning_rate": 8.929919866320765e-07, + "loss": 0.73243451, + "num_input_tokens_seen": 124559355, + "step": 5789, + "time_per_iteration": 3.726850748062134 + }, + { + "auxiliary_loss_clip": 0.01108733, + "auxiliary_loss_mlp": 0.00872963, + "balance_loss_clip": 1.0240283, + "balance_loss_mlp": 1.00010896, + "epoch": 0.6962063368003367, + "flos": 17566566986880.0, + "grad_norm": 2.1159385634229944, + "language_loss": 0.81885052, + "learning_rate": 8.923433067942385e-07, + "loss": 0.83866751, + "num_input_tokens_seen": 124577920, + "step": 5790, + "time_per_iteration": 2.754922389984131 + }, + { + "auxiliary_loss_clip": 0.01092923, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_clip": 1.02480936, + "balance_loss_mlp": 1.00370967, + "epoch": 0.6963265796909758, + "flos": 21251648021760.0, + "grad_norm": 1.8224228839343284, + "language_loss": 0.68497592, + "learning_rate": 8.916947949870417e-07, + "loss": 0.70674378, + "num_input_tokens_seen": 124597585, + "step": 5791, + "time_per_iteration": 2.7596423625946045 + }, + { + "auxiliary_loss_clip": 0.01108434, + "auxiliary_loss_mlp": 0.01079559, + "balance_loss_clip": 1.02075481, + "balance_loss_mlp": 1.00059474, + "epoch": 0.6964468225816148, + "flos": 68828295801600.0, + "grad_norm": 0.7420742929930073, + "language_loss": 0.58123469, + "learning_rate": 8.910464513088615e-07, + "loss": 0.6031146, + "num_input_tokens_seen": 124661625, + "step": 5792, + "time_per_iteration": 3.3024497032165527 + }, + { + "auxiliary_loss_clip": 0.01115779, + "auxiliary_loss_mlp": 0.01083972, + "balance_loss_clip": 1.02371192, + "balance_loss_mlp": 1.0037204, + "epoch": 0.696567065472254, + "flos": 18950887192320.0, + "grad_norm": 1.7635048639184578, + "language_loss": 0.78127551, + "learning_rate": 8.903982758580542e-07, + "loss": 0.80327302, + "num_input_tokens_seen": 124680565, + "step": 5793, + "time_per_iteration": 2.7473955154418945 + }, + { + "auxiliary_loss_clip": 0.01113432, + "auxiliary_loss_mlp": 0.01085693, + "balance_loss_clip": 1.02232075, + "balance_loss_mlp": 1.00553644, + "epoch": 0.696687308362893, + "flos": 22856675345280.0, + "grad_norm": 2.2320720974975856, + "language_loss": 0.80732155, + "learning_rate": 8.897502687329457e-07, + "loss": 0.8293128, + "num_input_tokens_seen": 124700365, + "step": 5794, + "time_per_iteration": 2.7349252700805664 + }, + { + "auxiliary_loss_clip": 0.01108803, + "auxiliary_loss_mlp": 0.01084735, + "balance_loss_clip": 1.02491951, + "balance_loss_mlp": 1.00462651, + "epoch": 0.6968075512535321, + "flos": 24972926987520.0, + "grad_norm": 1.8208310855043568, + "language_loss": 0.80074304, + "learning_rate": 8.891024300318382e-07, + "loss": 0.82267845, + "num_input_tokens_seen": 124718935, + "step": 5795, + "time_per_iteration": 2.9267330169677734 + }, + { + "auxiliary_loss_clip": 0.01109407, + "auxiliary_loss_mlp": 0.01085003, + "balance_loss_clip": 1.02483773, + "balance_loss_mlp": 1.00494182, + "epoch": 0.6969277941441713, + "flos": 21030438113280.0, + "grad_norm": 2.248716848532902, + "language_loss": 0.75928783, + "learning_rate": 8.884547598530103e-07, + "loss": 0.78123194, + "num_input_tokens_seen": 124739505, + "step": 5796, + "time_per_iteration": 2.8261559009552 + }, + { + "auxiliary_loss_clip": 0.01072831, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.02098143, + "balance_loss_mlp": 1.00445223, + "epoch": 0.6970480370348103, + "flos": 21579404647680.0, + "grad_norm": 1.6677627010366947, + "language_loss": 0.75150406, + "learning_rate": 8.8780725829471e-07, + "loss": 0.77307892, + "num_input_tokens_seen": 124757410, + "step": 5797, + "time_per_iteration": 2.863889455795288 + }, + { + "auxiliary_loss_clip": 0.01134926, + "auxiliary_loss_mlp": 0.01084025, + "balance_loss_clip": 1.02511764, + "balance_loss_mlp": 1.00386882, + "epoch": 0.6971682799254494, + "flos": 22419175691520.0, + "grad_norm": 1.958969994493495, + "language_loss": 0.77991527, + "learning_rate": 8.87159925455165e-07, + "loss": 0.80210483, + "num_input_tokens_seen": 124777240, + "step": 5798, + "time_per_iteration": 2.6740686893463135 + }, + { + "auxiliary_loss_clip": 0.01107168, + "auxiliary_loss_mlp": 0.01085301, + "balance_loss_clip": 1.0235461, + "balance_loss_mlp": 1.00500178, + "epoch": 0.6972885228160886, + "flos": 20005834659840.0, + "grad_norm": 1.8503585914582916, + "language_loss": 0.73563421, + "learning_rate": 8.865127614325738e-07, + "loss": 0.75755894, + "num_input_tokens_seen": 124795670, + "step": 5799, + "time_per_iteration": 2.7635586261749268 + }, + { + "auxiliary_loss_clip": 0.01118534, + "auxiliary_loss_mlp": 0.01084099, + "balance_loss_clip": 1.02507687, + "balance_loss_mlp": 1.00389469, + "epoch": 0.6974087657067276, + "flos": 37853437656960.0, + "grad_norm": 2.799764419373485, + "language_loss": 0.67359293, + "learning_rate": 8.85865766325113e-07, + "loss": 0.69561934, + "num_input_tokens_seen": 124819600, + "step": 5800, + "time_per_iteration": 2.892824411392212 + }, + { + "auxiliary_loss_clip": 0.01118643, + "auxiliary_loss_mlp": 0.01083308, + "balance_loss_clip": 1.02529442, + "balance_loss_mlp": 1.00300884, + "epoch": 0.6975290085973667, + "flos": 29489267543040.0, + "grad_norm": 2.6384930099897437, + "language_loss": 0.72071958, + "learning_rate": 8.852189402309287e-07, + "loss": 0.74273908, + "num_input_tokens_seen": 124838785, + "step": 5801, + "time_per_iteration": 2.7861976623535156 + }, + { + "auxiliary_loss_clip": 0.01119182, + "auxiliary_loss_mlp": 0.0108379, + "balance_loss_clip": 1.02432084, + "balance_loss_mlp": 1.00353837, + "epoch": 0.6976492514880057, + "flos": 12895630295040.0, + "grad_norm": 2.145546800603557, + "language_loss": 0.74056351, + "learning_rate": 8.845722832481441e-07, + "loss": 0.76259327, + "num_input_tokens_seen": 124854215, + "step": 5802, + "time_per_iteration": 2.6566731929779053 + }, + { + "auxiliary_loss_clip": 0.01124412, + "auxiliary_loss_mlp": 0.01084703, + "balance_loss_clip": 1.02476645, + "balance_loss_mlp": 1.00449896, + "epoch": 0.6977694943786449, + "flos": 24352929308160.0, + "grad_norm": 1.9806731203675623, + "language_loss": 0.77311975, + "learning_rate": 8.83925795474858e-07, + "loss": 0.7952109, + "num_input_tokens_seen": 124874340, + "step": 5803, + "time_per_iteration": 2.658417224884033 + }, + { + "auxiliary_loss_clip": 0.01096702, + "auxiliary_loss_mlp": 0.01083829, + "balance_loss_clip": 1.02144229, + "balance_loss_mlp": 1.00362468, + "epoch": 0.6978897372692839, + "flos": 29898470257920.0, + "grad_norm": 2.6970512754844473, + "language_loss": 0.58870852, + "learning_rate": 8.832794770091414e-07, + "loss": 0.61051381, + "num_input_tokens_seen": 124895175, + "step": 5804, + "time_per_iteration": 2.817822217941284 + }, + { + "auxiliary_loss_clip": 0.01115869, + "auxiliary_loss_mlp": 0.01085048, + "balance_loss_clip": 1.02303147, + "balance_loss_mlp": 1.00474894, + "epoch": 0.698009980159923, + "flos": 21761579450880.0, + "grad_norm": 1.9664849526916508, + "language_loss": 0.82528847, + "learning_rate": 8.826333279490401e-07, + "loss": 0.84729767, + "num_input_tokens_seen": 124915810, + "step": 5805, + "time_per_iteration": 2.7291855812072754 + }, + { + "auxiliary_loss_clip": 0.01117603, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.02549016, + "balance_loss_mlp": 1.00418484, + "epoch": 0.6981302230505622, + "flos": 19857164267520.0, + "grad_norm": 1.984607401966305, + "language_loss": 0.68357611, + "learning_rate": 8.819873483925748e-07, + "loss": 0.70559597, + "num_input_tokens_seen": 124932930, + "step": 5806, + "time_per_iteration": 2.71893048286438 + }, + { + "auxiliary_loss_clip": 0.01107665, + "auxiliary_loss_mlp": 0.0087289, + "balance_loss_clip": 1.02427351, + "balance_loss_mlp": 1.00006497, + "epoch": 0.6982504659412012, + "flos": 22198648141440.0, + "grad_norm": 1.8848269318345048, + "language_loss": 0.74333203, + "learning_rate": 8.81341538437739e-07, + "loss": 0.76313752, + "num_input_tokens_seen": 124951220, + "step": 5807, + "time_per_iteration": 2.8101353645324707 + }, + { + "auxiliary_loss_clip": 0.01116773, + "auxiliary_loss_mlp": 0.01084331, + "balance_loss_clip": 1.02415466, + "balance_loss_mlp": 1.00417447, + "epoch": 0.6983707088318403, + "flos": 35588479708800.0, + "grad_norm": 1.8481268754577214, + "language_loss": 0.68000597, + "learning_rate": 8.80695898182503e-07, + "loss": 0.70201695, + "num_input_tokens_seen": 124972200, + "step": 5808, + "time_per_iteration": 2.806323289871216 + }, + { + "auxiliary_loss_clip": 0.01097836, + "auxiliary_loss_mlp": 0.01079442, + "balance_loss_clip": 1.01119339, + "balance_loss_mlp": 1.00047815, + "epoch": 0.6984909517224794, + "flos": 65440052760960.0, + "grad_norm": 0.8197499513574024, + "language_loss": 0.6512655, + "learning_rate": 8.800504277248093e-07, + "loss": 0.67303824, + "num_input_tokens_seen": 125036950, + "step": 5809, + "time_per_iteration": 3.2632570266723633 + }, + { + "auxiliary_loss_clip": 0.0110106, + "auxiliary_loss_mlp": 0.00872823, + "balance_loss_clip": 1.02051377, + "balance_loss_mlp": 1.00015295, + "epoch": 0.6986111946131185, + "flos": 18546927863040.0, + "grad_norm": 1.5556134626841527, + "language_loss": 0.75086784, + "learning_rate": 8.794051271625753e-07, + "loss": 0.77060664, + "num_input_tokens_seen": 125054585, + "step": 5810, + "time_per_iteration": 3.7013983726501465 + }, + { + "auxiliary_loss_clip": 0.01115705, + "auxiliary_loss_mlp": 0.01084687, + "balance_loss_clip": 1.02367616, + "balance_loss_mlp": 1.00457835, + "epoch": 0.6987314375037575, + "flos": 23039173370880.0, + "grad_norm": 1.5521047029500308, + "language_loss": 0.83429432, + "learning_rate": 8.787599965936925e-07, + "loss": 0.85629821, + "num_input_tokens_seen": 125075515, + "step": 5811, + "time_per_iteration": 2.7410290241241455 + }, + { + "auxiliary_loss_clip": 0.0110623, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_clip": 1.02307749, + "balance_loss_mlp": 1.0037154, + "epoch": 0.6988516803943967, + "flos": 38400393029760.0, + "grad_norm": 1.7113360514577667, + "language_loss": 0.71888298, + "learning_rate": 8.781150361160261e-07, + "loss": 0.74078304, + "num_input_tokens_seen": 125097425, + "step": 5812, + "time_per_iteration": 2.9373440742492676 + }, + { + "auxiliary_loss_clip": 0.01086359, + "auxiliary_loss_mlp": 0.01084602, + "balance_loss_clip": 1.02467895, + "balance_loss_mlp": 1.00430274, + "epoch": 0.6989719232850358, + "flos": 24096993926400.0, + "grad_norm": 1.5811123448767008, + "language_loss": 0.73753929, + "learning_rate": 8.774702458274181e-07, + "loss": 0.75924891, + "num_input_tokens_seen": 125117830, + "step": 5813, + "time_per_iteration": 3.8118066787719727 + }, + { + "auxiliary_loss_clip": 0.01121008, + "auxiliary_loss_mlp": 0.01083608, + "balance_loss_clip": 1.02154541, + "balance_loss_mlp": 1.00335646, + "epoch": 0.6990921661756748, + "flos": 14866838818560.0, + "grad_norm": 4.53484689546653, + "language_loss": 0.70689619, + "learning_rate": 8.768256258256799e-07, + "loss": 0.72894239, + "num_input_tokens_seen": 125134455, + "step": 5814, + "time_per_iteration": 3.5205819606781006 + }, + { + "auxiliary_loss_clip": 0.01129145, + "auxiliary_loss_mlp": 0.01083377, + "balance_loss_clip": 1.0271666, + "balance_loss_mlp": 1.00322104, + "epoch": 0.699212409066314, + "flos": 20193719725440.0, + "grad_norm": 1.8333133114658278, + "language_loss": 0.73818558, + "learning_rate": 8.76181176208602e-07, + "loss": 0.76031083, + "num_input_tokens_seen": 125152555, + "step": 5815, + "time_per_iteration": 2.665639638900757 + }, + { + "auxiliary_loss_clip": 0.01096154, + "auxiliary_loss_mlp": 0.01084275, + "balance_loss_clip": 1.02145433, + "balance_loss_mlp": 1.00402379, + "epoch": 0.699332651956953, + "flos": 19427888828160.0, + "grad_norm": 1.9451103217958399, + "language_loss": 0.73292732, + "learning_rate": 8.755368970739461e-07, + "loss": 0.75473154, + "num_input_tokens_seen": 125171915, + "step": 5816, + "time_per_iteration": 2.789423942565918 + }, + { + "auxiliary_loss_clip": 0.01108187, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_clip": 1.02328253, + "balance_loss_mlp": 1.00413918, + "epoch": 0.6994528948475921, + "flos": 16143714466560.0, + "grad_norm": 2.8231816568967636, + "language_loss": 0.61775178, + "learning_rate": 8.748927885194479e-07, + "loss": 0.63967758, + "num_input_tokens_seen": 125190220, + "step": 5817, + "time_per_iteration": 2.798353672027588 + }, + { + "auxiliary_loss_clip": 0.01091153, + "auxiliary_loss_mlp": 0.01079745, + "balance_loss_clip": 1.02952528, + "balance_loss_mlp": 1.0007813, + "epoch": 0.6995731377382313, + "flos": 64952420699520.0, + "grad_norm": 0.7923618865279766, + "language_loss": 0.57447219, + "learning_rate": 8.742488506428209e-07, + "loss": 0.59618115, + "num_input_tokens_seen": 125249310, + "step": 5818, + "time_per_iteration": 3.25555419921875 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.00872936, + "balance_loss_clip": 1.02395892, + "balance_loss_mlp": 1.0001142, + "epoch": 0.6996933806288703, + "flos": 24900136076160.0, + "grad_norm": 1.736638478555462, + "language_loss": 0.78214312, + "learning_rate": 8.736050835417466e-07, + "loss": 0.80203837, + "num_input_tokens_seen": 125269350, + "step": 5819, + "time_per_iteration": 2.7515571117401123 + }, + { + "auxiliary_loss_clip": 0.01127262, + "auxiliary_loss_mlp": 0.01083835, + "balance_loss_clip": 1.02570629, + "balance_loss_mlp": 1.00363135, + "epoch": 0.6998136235195094, + "flos": 20777806782720.0, + "grad_norm": 2.0894947325258415, + "language_loss": 0.61511612, + "learning_rate": 8.729614873138862e-07, + "loss": 0.63722706, + "num_input_tokens_seen": 125286985, + "step": 5820, + "time_per_iteration": 2.788642406463623 + }, + { + "auxiliary_loss_clip": 0.0109651, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_clip": 1.02269495, + "balance_loss_mlp": 1.00413752, + "epoch": 0.6999338664101485, + "flos": 23733470332800.0, + "grad_norm": 2.3221176071524914, + "language_loss": 0.77484959, + "learning_rate": 8.723180620568716e-07, + "loss": 0.7966581, + "num_input_tokens_seen": 125306240, + "step": 5821, + "time_per_iteration": 2.8345930576324463 + }, + { + "auxiliary_loss_clip": 0.01101952, + "auxiliary_loss_mlp": 0.01083875, + "balance_loss_clip": 1.02559257, + "balance_loss_mlp": 1.00376582, + "epoch": 0.7000541093007876, + "flos": 19864598382720.0, + "grad_norm": 1.8966919264619746, + "language_loss": 0.85019255, + "learning_rate": 8.716748078683116e-07, + "loss": 0.87205082, + "num_input_tokens_seen": 125323015, + "step": 5822, + "time_per_iteration": 2.7437829971313477 + }, + { + "auxiliary_loss_clip": 0.01072234, + "auxiliary_loss_mlp": 0.01085787, + "balance_loss_clip": 1.02014112, + "balance_loss_mlp": 1.00548768, + "epoch": 0.7001743521914267, + "flos": 29679056029440.0, + "grad_norm": 2.051102445619577, + "language_loss": 0.68562979, + "learning_rate": 8.710317248457855e-07, + "loss": 0.70720994, + "num_input_tokens_seen": 125342630, + "step": 5823, + "time_per_iteration": 2.9094736576080322 + }, + { + "auxiliary_loss_clip": 0.01109749, + "auxiliary_loss_mlp": 0.01082938, + "balance_loss_clip": 1.02464509, + "balance_loss_mlp": 1.00278139, + "epoch": 0.7002945950820658, + "flos": 27489762080640.0, + "grad_norm": 1.754023115465778, + "language_loss": 0.72096324, + "learning_rate": 8.703888130868482e-07, + "loss": 0.74289006, + "num_input_tokens_seen": 125364480, + "step": 5824, + "time_per_iteration": 2.7762272357940674 + }, + { + "auxiliary_loss_clip": 0.01106385, + "auxiliary_loss_mlp": 0.01085031, + "balance_loss_clip": 1.02390409, + "balance_loss_mlp": 1.00492239, + "epoch": 0.7004148379727049, + "flos": 22158463800960.0, + "grad_norm": 2.4416014646184365, + "language_loss": 0.82307708, + "learning_rate": 8.697460726890307e-07, + "loss": 0.84499121, + "num_input_tokens_seen": 125381625, + "step": 5825, + "time_per_iteration": 2.7750205993652344 + }, + { + "auxiliary_loss_clip": 0.0110952, + "auxiliary_loss_mlp": 0.00872893, + "balance_loss_clip": 1.02383208, + "balance_loss_mlp": 1.00009394, + "epoch": 0.7005350808633439, + "flos": 19423758764160.0, + "grad_norm": 2.134179151226139, + "language_loss": 0.90765917, + "learning_rate": 8.691035037498354e-07, + "loss": 0.92748332, + "num_input_tokens_seen": 125397615, + "step": 5826, + "time_per_iteration": 2.7777822017669678 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.01083977, + "balance_loss_clip": 1.02429271, + "balance_loss_mlp": 1.00386846, + "epoch": 0.7006553237539831, + "flos": 23476708938240.0, + "grad_norm": 1.6868695587531572, + "language_loss": 0.72372949, + "learning_rate": 8.684611063667391e-07, + "loss": 0.74574745, + "num_input_tokens_seen": 125418080, + "step": 5827, + "time_per_iteration": 2.824657678604126 + }, + { + "auxiliary_loss_clip": 0.01126097, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_clip": 1.02459669, + "balance_loss_mlp": 1.00423765, + "epoch": 0.7007755666446221, + "flos": 31212872640000.0, + "grad_norm": 2.2381869489035298, + "language_loss": 0.76936275, + "learning_rate": 8.678188806371935e-07, + "loss": 0.79146767, + "num_input_tokens_seen": 125440115, + "step": 5828, + "time_per_iteration": 2.8224077224731445 + }, + { + "auxiliary_loss_clip": 0.01126477, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_clip": 1.02535248, + "balance_loss_mlp": 1.00442314, + "epoch": 0.7008958095352612, + "flos": 18149899858560.0, + "grad_norm": 1.6098657065950992, + "language_loss": 0.85280401, + "learning_rate": 8.671768266586228e-07, + "loss": 0.87491453, + "num_input_tokens_seen": 125458240, + "step": 5829, + "time_per_iteration": 2.73580265045166 + }, + { + "auxiliary_loss_clip": 0.01109566, + "auxiliary_loss_mlp": 0.01084113, + "balance_loss_clip": 1.02555299, + "balance_loss_mlp": 1.00390887, + "epoch": 0.7010160524259004, + "flos": 27452307173760.0, + "grad_norm": 1.6031943336308168, + "language_loss": 0.78153944, + "learning_rate": 8.665349445284275e-07, + "loss": 0.80347627, + "num_input_tokens_seen": 125477980, + "step": 5830, + "time_per_iteration": 2.824068307876587 + }, + { + "auxiliary_loss_clip": 0.01101361, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_clip": 1.02025962, + "balance_loss_mlp": 1.00391448, + "epoch": 0.7011362953165394, + "flos": 23842064125440.0, + "grad_norm": 1.3912970357683832, + "language_loss": 0.81140995, + "learning_rate": 8.658932343439799e-07, + "loss": 0.83326375, + "num_input_tokens_seen": 125497765, + "step": 5831, + "time_per_iteration": 2.7699995040893555 + }, + { + "auxiliary_loss_clip": 0.01135597, + "auxiliary_loss_mlp": 0.01084798, + "balance_loss_clip": 1.02629769, + "balance_loss_mlp": 1.0045464, + "epoch": 0.7012565382071785, + "flos": 24823430582400.0, + "grad_norm": 1.8233742194680054, + "language_loss": 0.77860707, + "learning_rate": 8.65251696202627e-07, + "loss": 0.80081099, + "num_input_tokens_seen": 125514145, + "step": 5832, + "time_per_iteration": 2.635990619659424 + }, + { + "auxiliary_loss_clip": 0.01089833, + "auxiliary_loss_mlp": 0.01083656, + "balance_loss_clip": 1.02339315, + "balance_loss_mlp": 1.00340414, + "epoch": 0.7013767810978175, + "flos": 21397445326080.0, + "grad_norm": 1.9712896881250763, + "language_loss": 0.87679011, + "learning_rate": 8.646103302016896e-07, + "loss": 0.898525, + "num_input_tokens_seen": 125533115, + "step": 5833, + "time_per_iteration": 2.7677829265594482 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01084175, + "balance_loss_clip": 1.02405643, + "balance_loss_mlp": 1.00382841, + "epoch": 0.7014970239884567, + "flos": 16687150306560.0, + "grad_norm": 1.908500091955797, + "language_loss": 0.88900626, + "learning_rate": 8.639691364384614e-07, + "loss": 0.91085446, + "num_input_tokens_seen": 125550740, + "step": 5834, + "time_per_iteration": 2.76228404045105 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01085085, + "balance_loss_clip": 1.02409768, + "balance_loss_mlp": 1.00478578, + "epoch": 0.7016172668790958, + "flos": 12568268718720.0, + "grad_norm": 2.2637557920036135, + "language_loss": 0.72798991, + "learning_rate": 8.633281150102136e-07, + "loss": 0.75001144, + "num_input_tokens_seen": 125567590, + "step": 5835, + "time_per_iteration": 3.4744346141815186 + }, + { + "auxiliary_loss_clip": 0.01114387, + "auxiliary_loss_mlp": 0.01083852, + "balance_loss_clip": 1.02348471, + "balance_loss_mlp": 1.00364816, + "epoch": 0.7017375097697348, + "flos": 17452729808640.0, + "grad_norm": 3.8558300237157375, + "language_loss": 0.68160933, + "learning_rate": 8.626872660141855e-07, + "loss": 0.7035917, + "num_input_tokens_seen": 125585500, + "step": 5836, + "time_per_iteration": 2.7591288089752197 + }, + { + "auxiliary_loss_clip": 0.01096271, + "auxiliary_loss_mlp": 0.01084714, + "balance_loss_clip": 1.02263784, + "balance_loss_mlp": 1.00451028, + "epoch": 0.701857752660374, + "flos": 18513028402560.0, + "grad_norm": 1.6096573283345896, + "language_loss": 0.75141501, + "learning_rate": 8.620465895475957e-07, + "loss": 0.77322483, + "num_input_tokens_seen": 125603720, + "step": 5837, + "time_per_iteration": 2.8062915802001953 + }, + { + "auxiliary_loss_clip": 0.01096941, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_clip": 1.02254725, + "balance_loss_mlp": 1.00446236, + "epoch": 0.701977995551013, + "flos": 24425971614720.0, + "grad_norm": 1.3953429978379044, + "language_loss": 0.75055724, + "learning_rate": 8.614060857076333e-07, + "loss": 0.77237284, + "num_input_tokens_seen": 125624390, + "step": 5838, + "time_per_iteration": 3.7290802001953125 + }, + { + "auxiliary_loss_clip": 0.01117731, + "auxiliary_loss_mlp": 0.01084449, + "balance_loss_clip": 1.02539539, + "balance_loss_mlp": 1.00419712, + "epoch": 0.7020982384416521, + "flos": 23002759958400.0, + "grad_norm": 1.823576047228917, + "language_loss": 0.74668419, + "learning_rate": 8.60765754591462e-07, + "loss": 0.76870596, + "num_input_tokens_seen": 125644085, + "step": 5839, + "time_per_iteration": 3.703117847442627 + }, + { + "auxiliary_loss_clip": 0.01134184, + "auxiliary_loss_mlp": 0.01083257, + "balance_loss_clip": 1.02489614, + "balance_loss_mlp": 1.00310063, + "epoch": 0.7022184813322913, + "flos": 20449080489600.0, + "grad_norm": 1.7012594257590057, + "language_loss": 0.7294749, + "learning_rate": 8.601255962962211e-07, + "loss": 0.75164932, + "num_input_tokens_seen": 125663095, + "step": 5840, + "time_per_iteration": 3.5603389739990234 + }, + { + "auxiliary_loss_clip": 0.01126941, + "auxiliary_loss_mlp": 0.01084789, + "balance_loss_clip": 1.0251646, + "balance_loss_mlp": 1.00439405, + "epoch": 0.7023387242229303, + "flos": 19790514581760.0, + "grad_norm": 2.4051204015034937, + "language_loss": 0.72456694, + "learning_rate": 8.594856109190194e-07, + "loss": 0.74668419, + "num_input_tokens_seen": 125680125, + "step": 5841, + "time_per_iteration": 2.6762280464172363 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.01084027, + "balance_loss_clip": 1.02498913, + "balance_loss_mlp": 1.00382316, + "epoch": 0.7024589671135694, + "flos": 33259278286080.0, + "grad_norm": 1.548302064135206, + "language_loss": 0.68979049, + "learning_rate": 8.588457985569446e-07, + "loss": 0.71197319, + "num_input_tokens_seen": 125703035, + "step": 5842, + "time_per_iteration": 2.7182583808898926 + }, + { + "auxiliary_loss_clip": 0.01134941, + "auxiliary_loss_mlp": 0.01084452, + "balance_loss_clip": 1.02525377, + "balance_loss_mlp": 1.00429618, + "epoch": 0.7025792100042085, + "flos": 19098982967040.0, + "grad_norm": 2.4415907896491627, + "language_loss": 0.716717, + "learning_rate": 8.582061593070542e-07, + "loss": 0.73891091, + "num_input_tokens_seen": 125723765, + "step": 5843, + "time_per_iteration": 2.759436845779419 + }, + { + "auxiliary_loss_clip": 0.01134436, + "auxiliary_loss_mlp": 0.00872955, + "balance_loss_clip": 1.02502155, + "balance_loss_mlp": 1.0000999, + "epoch": 0.7026994528948476, + "flos": 18952611045120.0, + "grad_norm": 1.927062050657624, + "language_loss": 0.76476109, + "learning_rate": 8.57566693266383e-07, + "loss": 0.78483498, + "num_input_tokens_seen": 125741455, + "step": 5844, + "time_per_iteration": 2.629094123840332 + }, + { + "auxiliary_loss_clip": 0.01116981, + "auxiliary_loss_mlp": 0.00872933, + "balance_loss_clip": 1.0236367, + "balance_loss_mlp": 1.00010133, + "epoch": 0.7028196957854866, + "flos": 19536662188800.0, + "grad_norm": 1.8289682404127248, + "language_loss": 0.69083464, + "learning_rate": 8.569274005319354e-07, + "loss": 0.71073377, + "num_input_tokens_seen": 125759855, + "step": 5845, + "time_per_iteration": 2.6614723205566406 + }, + { + "auxiliary_loss_clip": 0.01128316, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_clip": 1.02599192, + "balance_loss_mlp": 1.00476623, + "epoch": 0.7029399386761258, + "flos": 20845318394880.0, + "grad_norm": 1.6465144263992069, + "language_loss": 0.79660332, + "learning_rate": 8.562882812006913e-07, + "loss": 0.8187362, + "num_input_tokens_seen": 125777345, + "step": 5846, + "time_per_iteration": 2.6638619899749756 + }, + { + "auxiliary_loss_clip": 0.01135027, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_clip": 1.02554107, + "balance_loss_mlp": 1.00557494, + "epoch": 0.7030601815667649, + "flos": 22055005653120.0, + "grad_norm": 1.591116278265422, + "language_loss": 0.77646112, + "learning_rate": 8.556493353696066e-07, + "loss": 0.79866827, + "num_input_tokens_seen": 125796345, + "step": 5847, + "time_per_iteration": 2.6390013694763184 + }, + { + "auxiliary_loss_clip": 0.01126315, + "auxiliary_loss_mlp": 0.00873048, + "balance_loss_clip": 1.02540755, + "balance_loss_mlp": 1.00012541, + "epoch": 0.7031804244574039, + "flos": 27198742089600.0, + "grad_norm": 2.115432003656086, + "language_loss": 0.67991734, + "learning_rate": 8.550105631356077e-07, + "loss": 0.699911, + "num_input_tokens_seen": 125816070, + "step": 5848, + "time_per_iteration": 2.7369301319122314 + }, + { + "auxiliary_loss_clip": 0.01102724, + "auxiliary_loss_mlp": 0.01084478, + "balance_loss_clip": 1.02437973, + "balance_loss_mlp": 1.00417852, + "epoch": 0.7033006673480431, + "flos": 22379853277440.0, + "grad_norm": 2.5127712780998173, + "language_loss": 0.77418357, + "learning_rate": 8.543719645955961e-07, + "loss": 0.79605561, + "num_input_tokens_seen": 125834400, + "step": 5849, + "time_per_iteration": 2.773268461227417 + }, + { + "auxiliary_loss_clip": 0.01115586, + "auxiliary_loss_mlp": 0.01084556, + "balance_loss_clip": 1.02317715, + "balance_loss_mlp": 1.0043993, + "epoch": 0.7034209102386821, + "flos": 24715986024960.0, + "grad_norm": 1.5226204180145437, + "language_loss": 0.74353313, + "learning_rate": 8.537335398464467e-07, + "loss": 0.76553452, + "num_input_tokens_seen": 125854720, + "step": 5850, + "time_per_iteration": 2.8273980617523193 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01085175, + "balance_loss_clip": 1.02425742, + "balance_loss_mlp": 1.00487578, + "epoch": 0.7035411531293212, + "flos": 22556174163840.0, + "grad_norm": 2.6207994867648874, + "language_loss": 0.84759831, + "learning_rate": 8.53095288985007e-07, + "loss": 0.86963123, + "num_input_tokens_seen": 125868455, + "step": 5851, + "time_per_iteration": 2.699228048324585 + }, + { + "auxiliary_loss_clip": 0.01134561, + "auxiliary_loss_mlp": 0.01085199, + "balance_loss_clip": 1.02517557, + "balance_loss_mlp": 1.00499475, + "epoch": 0.7036613960199604, + "flos": 22674967418880.0, + "grad_norm": 1.6004808549806007, + "language_loss": 0.82180285, + "learning_rate": 8.524572121081009e-07, + "loss": 0.84400046, + "num_input_tokens_seen": 125888555, + "step": 5852, + "time_per_iteration": 2.654723882675171 + }, + { + "auxiliary_loss_clip": 0.01127206, + "auxiliary_loss_mlp": 0.01084141, + "balance_loss_clip": 1.02595866, + "balance_loss_mlp": 1.00393653, + "epoch": 0.7037816389105994, + "flos": 22492146170880.0, + "grad_norm": 1.842557898371256, + "language_loss": 0.62633961, + "learning_rate": 8.518193093125232e-07, + "loss": 0.64845312, + "num_input_tokens_seen": 125907610, + "step": 5853, + "time_per_iteration": 2.6584625244140625 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.01084102, + "balance_loss_clip": 1.02318728, + "balance_loss_mlp": 1.00394607, + "epoch": 0.7039018818012385, + "flos": 27087490690560.0, + "grad_norm": 1.6617033667207648, + "language_loss": 0.81042182, + "learning_rate": 8.511815806950436e-07, + "loss": 0.83242536, + "num_input_tokens_seen": 125928640, + "step": 5854, + "time_per_iteration": 2.77860426902771 + }, + { + "auxiliary_loss_clip": 0.01124045, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_clip": 1.02322221, + "balance_loss_mlp": 1.00404167, + "epoch": 0.7040221246918776, + "flos": 17749819198080.0, + "grad_norm": 1.5768812812653736, + "language_loss": 0.7813071, + "learning_rate": 8.505440263524044e-07, + "loss": 0.80339098, + "num_input_tokens_seen": 125947485, + "step": 5855, + "time_per_iteration": 2.625088691711426 + }, + { + "auxiliary_loss_clip": 0.01127198, + "auxiliary_loss_mlp": 0.01084697, + "balance_loss_clip": 1.02538991, + "balance_loss_mlp": 1.00439787, + "epoch": 0.7041423675825167, + "flos": 16279851012480.0, + "grad_norm": 2.2498521068794055, + "language_loss": 0.88131058, + "learning_rate": 8.49906646381322e-07, + "loss": 0.90342951, + "num_input_tokens_seen": 125960320, + "step": 5856, + "time_per_iteration": 2.701362371444702 + }, + { + "auxiliary_loss_clip": 0.01091885, + "auxiliary_loss_mlp": 0.01083744, + "balance_loss_clip": 1.02469909, + "balance_loss_mlp": 1.00358772, + "epoch": 0.7042626104731557, + "flos": 25483181639040.0, + "grad_norm": 1.7488764968328128, + "language_loss": 0.72467536, + "learning_rate": 8.492694408784884e-07, + "loss": 0.74643165, + "num_input_tokens_seen": 125980575, + "step": 5857, + "time_per_iteration": 2.793513059616089 + }, + { + "auxiliary_loss_clip": 0.0112717, + "auxiliary_loss_mlp": 0.0108469, + "balance_loss_clip": 1.02533245, + "balance_loss_mlp": 1.00453377, + "epoch": 0.7043828533637949, + "flos": 17857622891520.0, + "grad_norm": 2.460699464853316, + "language_loss": 0.62028223, + "learning_rate": 8.486324099405642e-07, + "loss": 0.64240086, + "num_input_tokens_seen": 125997420, + "step": 5858, + "time_per_iteration": 2.6449172496795654 + }, + { + "auxiliary_loss_clip": 0.01123564, + "auxiliary_loss_mlp": 0.01083491, + "balance_loss_clip": 1.02341485, + "balance_loss_mlp": 1.00338244, + "epoch": 0.704503096254434, + "flos": 29494259533440.0, + "grad_norm": 2.0672208207845553, + "language_loss": 0.74750823, + "learning_rate": 8.479955536641887e-07, + "loss": 0.76957881, + "num_input_tokens_seen": 126018915, + "step": 5859, + "time_per_iteration": 2.776784896850586 + }, + { + "auxiliary_loss_clip": 0.01117462, + "auxiliary_loss_mlp": 0.0108398, + "balance_loss_clip": 1.02387428, + "balance_loss_mlp": 1.00382364, + "epoch": 0.704623339145073, + "flos": 30920739327360.0, + "grad_norm": 1.96718583529298, + "language_loss": 0.6623342, + "learning_rate": 8.473588721459716e-07, + "loss": 0.68434864, + "num_input_tokens_seen": 126038825, + "step": 5860, + "time_per_iteration": 2.75281023979187 + }, + { + "auxiliary_loss_clip": 0.01126202, + "auxiliary_loss_mlp": 0.01085129, + "balance_loss_clip": 1.02611971, + "balance_loss_mlp": 1.00478148, + "epoch": 0.7047435820357122, + "flos": 23914747296000.0, + "grad_norm": 1.9255359627563882, + "language_loss": 0.70342124, + "learning_rate": 8.467223654824967e-07, + "loss": 0.72553444, + "num_input_tokens_seen": 126058280, + "step": 5861, + "time_per_iteration": 3.6946818828582764 + }, + { + "auxiliary_loss_clip": 0.01127812, + "auxiliary_loss_mlp": 0.01084783, + "balance_loss_clip": 1.02611947, + "balance_loss_mlp": 1.00453091, + "epoch": 0.7048638249263512, + "flos": 46494010926720.0, + "grad_norm": 1.9020585058651358, + "language_loss": 0.62305504, + "learning_rate": 8.460860337703233e-07, + "loss": 0.64518094, + "num_input_tokens_seen": 126078885, + "step": 5862, + "time_per_iteration": 2.939857244491577 + }, + { + "auxiliary_loss_clip": 0.01104529, + "auxiliary_loss_mlp": 0.01083605, + "balance_loss_clip": 1.02098393, + "balance_loss_mlp": 1.00330591, + "epoch": 0.7049840678169903, + "flos": 21689219502720.0, + "grad_norm": 1.7608905662437928, + "language_loss": 0.70561355, + "learning_rate": 8.454498771059797e-07, + "loss": 0.7274949, + "num_input_tokens_seen": 126098260, + "step": 5863, + "time_per_iteration": 2.7441794872283936 + }, + { + "auxiliary_loss_clip": 0.01091066, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_clip": 1.02194786, + "balance_loss_mlp": 1.00579953, + "epoch": 0.7051043107076294, + "flos": 18405081054720.0, + "grad_norm": 1.8895407774429054, + "language_loss": 0.8321836, + "learning_rate": 8.448138955859725e-07, + "loss": 0.85395479, + "num_input_tokens_seen": 126114845, + "step": 5864, + "time_per_iteration": 4.719519138336182 + }, + { + "auxiliary_loss_clip": 0.01115643, + "auxiliary_loss_mlp": 0.01084564, + "balance_loss_clip": 1.02396584, + "balance_loss_mlp": 1.00435948, + "epoch": 0.7052245535982685, + "flos": 19319043640320.0, + "grad_norm": 1.8444464385015813, + "language_loss": 0.89642286, + "learning_rate": 8.44178089306778e-07, + "loss": 0.91842496, + "num_input_tokens_seen": 126132780, + "step": 5865, + "time_per_iteration": 3.684391736984253 + }, + { + "auxiliary_loss_clip": 0.01135781, + "auxiliary_loss_mlp": 0.01085305, + "balance_loss_clip": 1.02667487, + "balance_loss_mlp": 1.00514889, + "epoch": 0.7053447964889076, + "flos": 19062138591360.0, + "grad_norm": 1.913354801314634, + "language_loss": 0.77013695, + "learning_rate": 8.4354245836485e-07, + "loss": 0.79234785, + "num_input_tokens_seen": 126151225, + "step": 5866, + "time_per_iteration": 2.6531522274017334 + }, + { + "auxiliary_loss_clip": 0.01100682, + "auxiliary_loss_mlp": 0.01084396, + "balance_loss_clip": 1.02340531, + "balance_loss_mlp": 1.00409663, + "epoch": 0.7054650393795466, + "flos": 27379228953600.0, + "grad_norm": 1.5045321768956939, + "language_loss": 0.72814381, + "learning_rate": 8.429070028566108e-07, + "loss": 0.74999464, + "num_input_tokens_seen": 126172535, + "step": 5867, + "time_per_iteration": 2.8034050464630127 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.01085238, + "balance_loss_clip": 1.02545834, + "balance_loss_mlp": 1.00508189, + "epoch": 0.7055852822701858, + "flos": 16102201322880.0, + "grad_norm": 2.0005284674694197, + "language_loss": 0.74765247, + "learning_rate": 8.422717228784586e-07, + "loss": 0.7697621, + "num_input_tokens_seen": 126189410, + "step": 5868, + "time_per_iteration": 2.636453151702881 + }, + { + "auxiliary_loss_clip": 0.01089633, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_clip": 1.02200592, + "balance_loss_mlp": 1.00486696, + "epoch": 0.7057055251608249, + "flos": 11692299744000.0, + "grad_norm": 1.7016644051086782, + "language_loss": 0.6930846, + "learning_rate": 8.416366185267663e-07, + "loss": 0.71483117, + "num_input_tokens_seen": 126206910, + "step": 5869, + "time_per_iteration": 2.8409783840179443 + }, + { + "auxiliary_loss_clip": 0.01126107, + "auxiliary_loss_mlp": 0.0108403, + "balance_loss_clip": 1.02475476, + "balance_loss_mlp": 1.00387359, + "epoch": 0.7058257680514639, + "flos": 22711560399360.0, + "grad_norm": 1.6806619938993899, + "language_loss": 0.77968526, + "learning_rate": 8.410016898978778e-07, + "loss": 0.80178666, + "num_input_tokens_seen": 126224385, + "step": 5870, + "time_per_iteration": 2.6293342113494873 + }, + { + "auxiliary_loss_clip": 0.01075358, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_clip": 1.02002871, + "balance_loss_mlp": 1.00437415, + "epoch": 0.7059460109421031, + "flos": 17529543043200.0, + "grad_norm": 1.6399018154117808, + "language_loss": 0.78757638, + "learning_rate": 8.403669370881115e-07, + "loss": 0.80917484, + "num_input_tokens_seen": 126243120, + "step": 5871, + "time_per_iteration": 2.803175687789917 + }, + { + "auxiliary_loss_clip": 0.01136501, + "auxiliary_loss_mlp": 0.01086045, + "balance_loss_clip": 1.0267657, + "balance_loss_mlp": 1.0058409, + "epoch": 0.7060662538327421, + "flos": 23544687427200.0, + "grad_norm": 1.6792218428487216, + "language_loss": 0.78510249, + "learning_rate": 8.397323601937587e-07, + "loss": 0.80732799, + "num_input_tokens_seen": 126263020, + "step": 5872, + "time_per_iteration": 2.741696357727051 + }, + { + "auxiliary_loss_clip": 0.01106116, + "auxiliary_loss_mlp": 0.01084877, + "balance_loss_clip": 1.02341557, + "balance_loss_mlp": 1.0045774, + "epoch": 0.7061864967233812, + "flos": 30260736875520.0, + "grad_norm": 2.3972326825299763, + "language_loss": 0.77007848, + "learning_rate": 8.390979593110838e-07, + "loss": 0.79198837, + "num_input_tokens_seen": 126285150, + "step": 5873, + "time_per_iteration": 2.770794630050659 + }, + { + "auxiliary_loss_clip": 0.01116734, + "auxiliary_loss_mlp": 0.01084812, + "balance_loss_clip": 1.02463412, + "balance_loss_mlp": 1.00451291, + "epoch": 0.7063067396140204, + "flos": 20701460424960.0, + "grad_norm": 1.5947908778514674, + "language_loss": 0.81601059, + "learning_rate": 8.384637345363262e-07, + "loss": 0.83802617, + "num_input_tokens_seen": 126304340, + "step": 5874, + "time_per_iteration": 2.7482008934020996 + }, + { + "auxiliary_loss_clip": 0.01118255, + "auxiliary_loss_mlp": 0.01084808, + "balance_loss_clip": 1.02474201, + "balance_loss_mlp": 1.00469923, + "epoch": 0.7064269825046594, + "flos": 32266168081920.0, + "grad_norm": 1.7586386313532298, + "language_loss": 0.76737237, + "learning_rate": 8.378296859656964e-07, + "loss": 0.78940296, + "num_input_tokens_seen": 126325495, + "step": 5875, + "time_per_iteration": 2.8631935119628906 + }, + { + "auxiliary_loss_clip": 0.01117292, + "auxiliary_loss_mlp": 0.0108461, + "balance_loss_clip": 1.02531993, + "balance_loss_mlp": 1.00450134, + "epoch": 0.7065472253952985, + "flos": 30227124723840.0, + "grad_norm": 2.222017799108955, + "language_loss": 0.67999178, + "learning_rate": 8.371958136953792e-07, + "loss": 0.70201087, + "num_input_tokens_seen": 126345525, + "step": 5876, + "time_per_iteration": 2.832618474960327 + }, + { + "auxiliary_loss_clip": 0.01107758, + "auxiliary_loss_mlp": 0.01084593, + "balance_loss_clip": 1.02376008, + "balance_loss_mlp": 1.00424612, + "epoch": 0.7066674682859376, + "flos": 16216720859520.0, + "grad_norm": 2.368200734501113, + "language_loss": 0.66419899, + "learning_rate": 8.365621178215326e-07, + "loss": 0.68612254, + "num_input_tokens_seen": 126361995, + "step": 5877, + "time_per_iteration": 2.7512593269348145 + }, + { + "auxiliary_loss_clip": 0.01126296, + "auxiliary_loss_mlp": 0.01084919, + "balance_loss_clip": 1.02476895, + "balance_loss_mlp": 1.00476277, + "epoch": 0.7067877111765767, + "flos": 14830461319680.0, + "grad_norm": 2.2301804396203995, + "language_loss": 0.75176179, + "learning_rate": 8.359285984402871e-07, + "loss": 0.77387393, + "num_input_tokens_seen": 126379260, + "step": 5878, + "time_per_iteration": 2.6919057369232178 + }, + { + "auxiliary_loss_clip": 0.01110477, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_clip": 1.02473366, + "balance_loss_mlp": 1.00337291, + "epoch": 0.7069079540672157, + "flos": 25440196037760.0, + "grad_norm": 1.9035159606745908, + "language_loss": 0.74021769, + "learning_rate": 8.352952556477489e-07, + "loss": 0.76215678, + "num_input_tokens_seen": 126397170, + "step": 5879, + "time_per_iteration": 2.710097551345825 + }, + { + "auxiliary_loss_clip": 0.01124819, + "auxiliary_loss_mlp": 0.0108365, + "balance_loss_clip": 1.02473438, + "balance_loss_mlp": 1.00344563, + "epoch": 0.7070281969578549, + "flos": 24607751368320.0, + "grad_norm": 1.7372785984255112, + "language_loss": 0.76517445, + "learning_rate": 8.34662089539993e-07, + "loss": 0.7872591, + "num_input_tokens_seen": 126416680, + "step": 5880, + "time_per_iteration": 2.71121883392334 + }, + { + "auxiliary_loss_clip": 0.01134423, + "auxiliary_loss_mlp": 0.01084368, + "balance_loss_clip": 1.02554095, + "balance_loss_mlp": 1.00421214, + "epoch": 0.707148439848494, + "flos": 26724469887360.0, + "grad_norm": 2.1712842941745545, + "language_loss": 0.79413235, + "learning_rate": 8.340291002130722e-07, + "loss": 0.8163203, + "num_input_tokens_seen": 126435870, + "step": 5881, + "time_per_iteration": 2.686711549758911 + }, + { + "auxiliary_loss_clip": 0.01135432, + "auxiliary_loss_mlp": 0.01084978, + "balance_loss_clip": 1.02604389, + "balance_loss_mlp": 1.00472665, + "epoch": 0.707268682739133, + "flos": 15085750256640.0, + "grad_norm": 2.455349464424396, + "language_loss": 0.79710305, + "learning_rate": 8.3339628776301e-07, + "loss": 0.81930709, + "num_input_tokens_seen": 126454010, + "step": 5882, + "time_per_iteration": 2.6313889026641846 + }, + { + "auxiliary_loss_clip": 0.01134795, + "auxiliary_loss_mlp": 0.01083844, + "balance_loss_clip": 1.02510417, + "balance_loss_mlp": 1.00363982, + "epoch": 0.7073889256297722, + "flos": 34313148345600.0, + "grad_norm": 2.057005594053132, + "language_loss": 0.5710125, + "learning_rate": 8.327636522858033e-07, + "loss": 0.5931989, + "num_input_tokens_seen": 126473615, + "step": 5883, + "time_per_iteration": 2.764063596725464 + }, + { + "auxiliary_loss_clip": 0.01081113, + "auxiliary_loss_mlp": 0.01084056, + "balance_loss_clip": 1.02457428, + "balance_loss_mlp": 1.00389969, + "epoch": 0.7075091685204112, + "flos": 20083940784000.0, + "grad_norm": 1.806700543385748, + "language_loss": 0.76584268, + "learning_rate": 8.321311938774225e-07, + "loss": 0.7874943, + "num_input_tokens_seen": 126492705, + "step": 5884, + "time_per_iteration": 2.949972629547119 + }, + { + "auxiliary_loss_clip": 0.01135021, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_clip": 1.0252248, + "balance_loss_mlp": 1.00416207, + "epoch": 0.7076294114110503, + "flos": 20777124424320.0, + "grad_norm": 1.9238677224020253, + "language_loss": 0.79490083, + "learning_rate": 8.314989126338104e-07, + "loss": 0.81709468, + "num_input_tokens_seen": 126512715, + "step": 5885, + "time_per_iteration": 2.7249934673309326 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01083299, + "balance_loss_clip": 1.0252068, + "balance_loss_mlp": 1.00314283, + "epoch": 0.7077496543016895, + "flos": 17967689141760.0, + "grad_norm": 1.6255063290222245, + "language_loss": 0.8417908, + "learning_rate": 8.308668086508847e-07, + "loss": 0.86388797, + "num_input_tokens_seen": 126530795, + "step": 5886, + "time_per_iteration": 2.6396286487579346 + }, + { + "auxiliary_loss_clip": 0.01108441, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_clip": 1.02356815, + "balance_loss_mlp": 1.00322127, + "epoch": 0.7078698971923285, + "flos": 45478098564480.0, + "grad_norm": 1.662375769223164, + "language_loss": 0.73730701, + "learning_rate": 8.302348820245342e-07, + "loss": 0.75922471, + "num_input_tokens_seen": 126553360, + "step": 5887, + "time_per_iteration": 3.911425828933716 + }, + { + "auxiliary_loss_clip": 0.0109224, + "auxiliary_loss_mlp": 0.01084152, + "balance_loss_clip": 1.02489269, + "balance_loss_mlp": 1.00385249, + "epoch": 0.7079901400829676, + "flos": 26943704547840.0, + "grad_norm": 2.6200189268479055, + "language_loss": 0.70524335, + "learning_rate": 8.296031328506232e-07, + "loss": 0.72700727, + "num_input_tokens_seen": 126573110, + "step": 5888, + "time_per_iteration": 2.7909321784973145 + }, + { + "auxiliary_loss_clip": 0.01117893, + "auxiliary_loss_mlp": 0.01084165, + "balance_loss_clip": 1.02594924, + "balance_loss_mlp": 1.00405622, + "epoch": 0.7081103829736067, + "flos": 24423206267520.0, + "grad_norm": 1.8656435247591288, + "language_loss": 0.75108278, + "learning_rate": 8.289715612249857e-07, + "loss": 0.77310336, + "num_input_tokens_seen": 126593725, + "step": 5889, + "time_per_iteration": 3.7299962043762207 + }, + { + "auxiliary_loss_clip": 0.01117223, + "auxiliary_loss_mlp": 0.01084579, + "balance_loss_clip": 1.02468824, + "balance_loss_mlp": 1.00432718, + "epoch": 0.7082306258642458, + "flos": 18543300589440.0, + "grad_norm": 13.008441661263847, + "language_loss": 0.77247125, + "learning_rate": 8.283401672434305e-07, + "loss": 0.79448926, + "num_input_tokens_seen": 126608950, + "step": 5890, + "time_per_iteration": 2.7497589588165283 + }, + { + "auxiliary_loss_clip": 0.0111455, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_clip": 1.02422142, + "balance_loss_mlp": 1.00355768, + "epoch": 0.7083508687548848, + "flos": 23477534951040.0, + "grad_norm": 2.012978591497955, + "language_loss": 0.70322454, + "learning_rate": 8.277089510017412e-07, + "loss": 0.72520769, + "num_input_tokens_seen": 126629755, + "step": 5891, + "time_per_iteration": 3.7253456115722656 + }, + { + "auxiliary_loss_clip": 0.01114438, + "auxiliary_loss_mlp": 0.01083725, + "balance_loss_clip": 1.02375257, + "balance_loss_mlp": 1.00361669, + "epoch": 0.708471111645524, + "flos": 22419463000320.0, + "grad_norm": 1.5891417253145315, + "language_loss": 0.81951785, + "learning_rate": 8.270779125956719e-07, + "loss": 0.84149951, + "num_input_tokens_seen": 126650135, + "step": 5892, + "time_per_iteration": 2.6811535358428955 + }, + { + "auxiliary_loss_clip": 0.01096832, + "auxiliary_loss_mlp": 0.01084657, + "balance_loss_clip": 1.02235353, + "balance_loss_mlp": 1.00445259, + "epoch": 0.7085913545361631, + "flos": 20922885815040.0, + "grad_norm": 2.159062966857959, + "language_loss": 0.80025303, + "learning_rate": 8.264470521209505e-07, + "loss": 0.82206798, + "num_input_tokens_seen": 126668500, + "step": 5893, + "time_per_iteration": 2.8247740268707275 + }, + { + "auxiliary_loss_clip": 0.0112592, + "auxiliary_loss_mlp": 0.01083676, + "balance_loss_clip": 1.02423263, + "balance_loss_mlp": 1.00351989, + "epoch": 0.7087115974268021, + "flos": 15012384727680.0, + "grad_norm": 2.014826278748687, + "language_loss": 0.76345384, + "learning_rate": 8.258163696732785e-07, + "loss": 0.78554976, + "num_input_tokens_seen": 126686090, + "step": 5894, + "time_per_iteration": 2.6801464557647705 + }, + { + "auxiliary_loss_clip": 0.01126474, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_clip": 1.02514338, + "balance_loss_mlp": 1.00342596, + "epoch": 0.7088318403174413, + "flos": 21539040739200.0, + "grad_norm": 1.8330028996151055, + "language_loss": 0.76882136, + "learning_rate": 8.251858653483288e-07, + "loss": 0.79092145, + "num_input_tokens_seen": 126704255, + "step": 5895, + "time_per_iteration": 2.72653865814209 + }, + { + "auxiliary_loss_clip": 0.01126001, + "auxiliary_loss_mlp": 0.01084252, + "balance_loss_clip": 1.02582765, + "balance_loss_mlp": 1.00409555, + "epoch": 0.7089520832080803, + "flos": 15516785462400.0, + "grad_norm": 2.056079451233854, + "language_loss": 0.85702848, + "learning_rate": 8.245555392417501e-07, + "loss": 0.87913096, + "num_input_tokens_seen": 126718910, + "step": 5896, + "time_per_iteration": 2.6444952487945557 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.010848, + "balance_loss_clip": 1.02247453, + "balance_loss_mlp": 1.00469136, + "epoch": 0.7090723260987194, + "flos": 20412667077120.0, + "grad_norm": 2.2789403833031603, + "language_loss": 0.79038954, + "learning_rate": 8.239253914491613e-07, + "loss": 0.81221855, + "num_input_tokens_seen": 126737235, + "step": 5897, + "time_per_iteration": 2.8101282119750977 + }, + { + "auxiliary_loss_clip": 0.01106971, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_clip": 1.02377224, + "balance_loss_mlp": 1.00409162, + "epoch": 0.7091925689893585, + "flos": 25668337271040.0, + "grad_norm": 1.7292789443136138, + "language_loss": 0.75093222, + "learning_rate": 8.232954220661556e-07, + "loss": 0.77284586, + "num_input_tokens_seen": 126759970, + "step": 5898, + "time_per_iteration": 2.78558087348938 + }, + { + "auxiliary_loss_clip": 0.01136459, + "auxiliary_loss_mlp": 0.01084795, + "balance_loss_clip": 1.02745414, + "balance_loss_mlp": 1.00463867, + "epoch": 0.7093128118799976, + "flos": 24206629213440.0, + "grad_norm": 2.412165874023238, + "language_loss": 0.70299464, + "learning_rate": 8.226656311882989e-07, + "loss": 0.72520721, + "num_input_tokens_seen": 126779280, + "step": 5899, + "time_per_iteration": 2.6641905307769775 + }, + { + "auxiliary_loss_clip": 0.01124352, + "auxiliary_loss_mlp": 0.01084854, + "balance_loss_clip": 1.02503049, + "balance_loss_mlp": 1.00474501, + "epoch": 0.7094330547706367, + "flos": 16646786398080.0, + "grad_norm": 2.1192921727577154, + "language_loss": 0.77032673, + "learning_rate": 8.22036018911129e-07, + "loss": 0.79241878, + "num_input_tokens_seen": 126797310, + "step": 5900, + "time_per_iteration": 2.636674165725708 + }, + { + "auxiliary_loss_clip": 0.01134414, + "auxiliary_loss_mlp": 0.01084973, + "balance_loss_clip": 1.02453625, + "balance_loss_mlp": 1.00467396, + "epoch": 0.7095532976612757, + "flos": 16283370545280.0, + "grad_norm": 8.430703787852277, + "language_loss": 0.80354273, + "learning_rate": 8.214065853301599e-07, + "loss": 0.82573664, + "num_input_tokens_seen": 126812840, + "step": 5901, + "time_per_iteration": 2.5786283016204834 + }, + { + "auxiliary_loss_clip": 0.0110727, + "auxiliary_loss_mlp": 0.01078961, + "balance_loss_clip": 1.01960564, + "balance_loss_mlp": 0.99999666, + "epoch": 0.7096735405519149, + "flos": 70722080559360.0, + "grad_norm": 0.8195230132432566, + "language_loss": 0.58281577, + "learning_rate": 8.207773305408734e-07, + "loss": 0.60467803, + "num_input_tokens_seen": 126880060, + "step": 5902, + "time_per_iteration": 3.361598253250122 + }, + { + "auxiliary_loss_clip": 0.01098743, + "auxiliary_loss_mlp": 0.01084678, + "balance_loss_clip": 1.02222013, + "balance_loss_mlp": 1.00442636, + "epoch": 0.709793783442554, + "flos": 23621500661760.0, + "grad_norm": 7.9594093466550175, + "language_loss": 0.80185091, + "learning_rate": 8.201482546387288e-07, + "loss": 0.82368517, + "num_input_tokens_seen": 126899535, + "step": 5903, + "time_per_iteration": 2.8124449253082275 + }, + { + "auxiliary_loss_clip": 0.01125049, + "auxiliary_loss_mlp": 0.01085125, + "balance_loss_clip": 1.02484035, + "balance_loss_mlp": 1.00492096, + "epoch": 0.709914026333193, + "flos": 25993472204160.0, + "grad_norm": 1.6557988162817412, + "language_loss": 0.91915452, + "learning_rate": 8.195193577191553e-07, + "loss": 0.94125623, + "num_input_tokens_seen": 126921365, + "step": 5904, + "time_per_iteration": 2.7416274547576904 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.00872864, + "balance_loss_clip": 1.02559042, + "balance_loss_mlp": 1.00011849, + "epoch": 0.7100342692238322, + "flos": 24861531934080.0, + "grad_norm": 1.7579123133194643, + "language_loss": 0.84581083, + "learning_rate": 8.188906398775579e-07, + "loss": 0.86556453, + "num_input_tokens_seen": 126941910, + "step": 5905, + "time_per_iteration": 2.7722768783569336 + }, + { + "auxiliary_loss_clip": 0.01134809, + "auxiliary_loss_mlp": 0.00872898, + "balance_loss_clip": 1.02505922, + "balance_loss_mlp": 1.00012612, + "epoch": 0.7101545121144712, + "flos": 24932203943040.0, + "grad_norm": 1.7174852259297515, + "language_loss": 0.68546772, + "learning_rate": 8.18262101209311e-07, + "loss": 0.70554471, + "num_input_tokens_seen": 126961120, + "step": 5906, + "time_per_iteration": 2.7044858932495117 + }, + { + "auxiliary_loss_clip": 0.01110115, + "auxiliary_loss_mlp": 0.01084743, + "balance_loss_clip": 1.02548432, + "balance_loss_mlp": 1.00458682, + "epoch": 0.7102747550051103, + "flos": 23768842250880.0, + "grad_norm": 1.7063218931181987, + "language_loss": 0.7021597, + "learning_rate": 8.176337418097626e-07, + "loss": 0.72410828, + "num_input_tokens_seen": 126981590, + "step": 5907, + "time_per_iteration": 2.784073829650879 + }, + { + "auxiliary_loss_clip": 0.01124764, + "auxiliary_loss_mlp": 0.00872847, + "balance_loss_clip": 1.02458751, + "balance_loss_mlp": 1.00011826, + "epoch": 0.7103949978957494, + "flos": 15303907509120.0, + "grad_norm": 1.9813867811965753, + "language_loss": 0.79742253, + "learning_rate": 8.170055617742364e-07, + "loss": 0.81739861, + "num_input_tokens_seen": 126998870, + "step": 5908, + "time_per_iteration": 2.7156949043273926 + }, + { + "auxiliary_loss_clip": 0.01116112, + "auxiliary_loss_mlp": 0.01084035, + "balance_loss_clip": 1.02427077, + "balance_loss_mlp": 1.00387883, + "epoch": 0.7105152407863885, + "flos": 22638805401600.0, + "grad_norm": 1.760786508047143, + "language_loss": 0.70663428, + "learning_rate": 8.163775611980252e-07, + "loss": 0.72863579, + "num_input_tokens_seen": 127017980, + "step": 5909, + "time_per_iteration": 2.731300115585327 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01083647, + "balance_loss_clip": 1.0235709, + "balance_loss_mlp": 1.00344276, + "epoch": 0.7106354836770276, + "flos": 17238594879360.0, + "grad_norm": 2.1685818370702954, + "language_loss": 0.78581947, + "learning_rate": 8.157497401763982e-07, + "loss": 0.80780697, + "num_input_tokens_seen": 127035645, + "step": 5910, + "time_per_iteration": 2.6949172019958496 + }, + { + "auxiliary_loss_clip": 0.01123942, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_clip": 1.02381229, + "balance_loss_mlp": 1.0039742, + "epoch": 0.7107557265676667, + "flos": 20193647898240.0, + "grad_norm": 2.119633492329892, + "language_loss": 0.77569944, + "learning_rate": 8.151220988045935e-07, + "loss": 0.79778016, + "num_input_tokens_seen": 127054900, + "step": 5911, + "time_per_iteration": 2.7085728645324707 + }, + { + "auxiliary_loss_clip": 0.01125473, + "auxiliary_loss_mlp": 0.0108471, + "balance_loss_clip": 1.02498496, + "balance_loss_mlp": 1.00455403, + "epoch": 0.7108759694583058, + "flos": 21507080613120.0, + "grad_norm": 1.620406295299517, + "language_loss": 0.82905185, + "learning_rate": 8.144946371778234e-07, + "loss": 0.85115373, + "num_input_tokens_seen": 127075010, + "step": 5912, + "time_per_iteration": 3.644219398498535 + }, + { + "auxiliary_loss_clip": 0.0111466, + "auxiliary_loss_mlp": 0.00872961, + "balance_loss_clip": 1.02375674, + "balance_loss_mlp": 1.00011611, + "epoch": 0.7109962123489448, + "flos": 24061909317120.0, + "grad_norm": 1.6128770144935078, + "language_loss": 0.78372443, + "learning_rate": 8.138673553912751e-07, + "loss": 0.80360067, + "num_input_tokens_seen": 127095570, + "step": 5913, + "time_per_iteration": 2.762751579284668 + }, + { + "auxiliary_loss_clip": 0.01082962, + "auxiliary_loss_mlp": 0.01084299, + "balance_loss_clip": 1.02385807, + "balance_loss_mlp": 1.00404727, + "epoch": 0.711116455239584, + "flos": 30480474326400.0, + "grad_norm": 2.8859161013881605, + "language_loss": 0.56342512, + "learning_rate": 8.132402535401059e-07, + "loss": 0.58509779, + "num_input_tokens_seen": 127116825, + "step": 5914, + "time_per_iteration": 4.80744481086731 + }, + { + "auxiliary_loss_clip": 0.01120557, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_clip": 1.02164292, + "balance_loss_mlp": 1.0039258, + "epoch": 0.711236698130223, + "flos": 25045610158080.0, + "grad_norm": 1.6222009940005633, + "language_loss": 0.74189526, + "learning_rate": 8.126133317194465e-07, + "loss": 0.76394212, + "num_input_tokens_seen": 127137015, + "step": 5915, + "time_per_iteration": 2.68776535987854 + }, + { + "auxiliary_loss_clip": 0.01071761, + "auxiliary_loss_mlp": 0.0108385, + "balance_loss_clip": 1.02096593, + "balance_loss_mlp": 1.00350344, + "epoch": 0.7113569410208621, + "flos": 24206701040640.0, + "grad_norm": 1.8249133155221218, + "language_loss": 0.74295306, + "learning_rate": 8.11986590024401e-07, + "loss": 0.7645092, + "num_input_tokens_seen": 127156755, + "step": 5916, + "time_per_iteration": 3.8413777351379395 + }, + { + "auxiliary_loss_clip": 0.01113014, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_clip": 1.0222286, + "balance_loss_mlp": 1.0047245, + "epoch": 0.7114771839115013, + "flos": 35439306526080.0, + "grad_norm": 1.5267107647461104, + "language_loss": 0.69016135, + "learning_rate": 8.113600285500442e-07, + "loss": 0.71214223, + "num_input_tokens_seen": 127176965, + "step": 5917, + "time_per_iteration": 2.963841676712036 + }, + { + "auxiliary_loss_clip": 0.01135502, + "auxiliary_loss_mlp": 0.01084064, + "balance_loss_clip": 1.02591586, + "balance_loss_mlp": 1.00386024, + "epoch": 0.7115974268021403, + "flos": 21099458096640.0, + "grad_norm": 1.6387930759702825, + "language_loss": 0.74226683, + "learning_rate": 8.107336473914268e-07, + "loss": 0.76446253, + "num_input_tokens_seen": 127195595, + "step": 5918, + "time_per_iteration": 2.5774879455566406 + }, + { + "auxiliary_loss_clip": 0.01098103, + "auxiliary_loss_mlp": 0.01078764, + "balance_loss_clip": 1.01900995, + "balance_loss_mlp": 0.99979937, + "epoch": 0.7117176696927794, + "flos": 56752866616320.0, + "grad_norm": 0.7743822558828475, + "language_loss": 0.557037, + "learning_rate": 8.101074466435694e-07, + "loss": 0.57880569, + "num_input_tokens_seen": 127255070, + "step": 5919, + "time_per_iteration": 3.2037289142608643 + }, + { + "auxiliary_loss_clip": 0.0112768, + "auxiliary_loss_mlp": 0.0108427, + "balance_loss_clip": 1.02621996, + "balance_loss_mlp": 1.00406599, + "epoch": 0.7118379125834186, + "flos": 15925269905280.0, + "grad_norm": 1.722676007429898, + "language_loss": 0.67687905, + "learning_rate": 8.094814264014662e-07, + "loss": 0.69899857, + "num_input_tokens_seen": 127273825, + "step": 5920, + "time_per_iteration": 2.6618361473083496 + }, + { + "auxiliary_loss_clip": 0.01135317, + "auxiliary_loss_mlp": 0.01084067, + "balance_loss_clip": 1.02537739, + "balance_loss_mlp": 1.00381505, + "epoch": 0.7119581554740576, + "flos": 20193360589440.0, + "grad_norm": 2.1950867262457683, + "language_loss": 0.81132936, + "learning_rate": 8.088555867600844e-07, + "loss": 0.83352315, + "num_input_tokens_seen": 127289990, + "step": 5921, + "time_per_iteration": 2.6472866535186768 + }, + { + "auxiliary_loss_clip": 0.01109556, + "auxiliary_loss_mlp": 0.0108492, + "balance_loss_clip": 1.02676988, + "balance_loss_mlp": 1.00485873, + "epoch": 0.7120783983646967, + "flos": 34715383822080.0, + "grad_norm": 1.7693782773745288, + "language_loss": 0.6013329, + "learning_rate": 8.08229927814362e-07, + "loss": 0.62327766, + "num_input_tokens_seen": 127312880, + "step": 5922, + "time_per_iteration": 2.9172658920288086 + }, + { + "auxiliary_loss_clip": 0.01106275, + "auxiliary_loss_mlp": 0.0108306, + "balance_loss_clip": 1.02316308, + "balance_loss_mlp": 1.00304651, + "epoch": 0.7121986412553358, + "flos": 26359114700160.0, + "grad_norm": 1.6821777785448229, + "language_loss": 0.65152174, + "learning_rate": 8.076044496592134e-07, + "loss": 0.67341506, + "num_input_tokens_seen": 127334730, + "step": 5923, + "time_per_iteration": 2.9253060817718506 + }, + { + "auxiliary_loss_clip": 0.01113592, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_clip": 1.02362657, + "balance_loss_mlp": 1.00488257, + "epoch": 0.7123188841459749, + "flos": 11145344371200.0, + "grad_norm": 2.27677532271751, + "language_loss": 0.7801851, + "learning_rate": 8.069791523895204e-07, + "loss": 0.80217135, + "num_input_tokens_seen": 127351180, + "step": 5924, + "time_per_iteration": 2.7113046646118164 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01084834, + "balance_loss_clip": 1.02339053, + "balance_loss_mlp": 1.00458264, + "epoch": 0.7124391270366139, + "flos": 20811670329600.0, + "grad_norm": 1.7146970022541643, + "language_loss": 0.77311832, + "learning_rate": 8.063540361001422e-07, + "loss": 0.79503977, + "num_input_tokens_seen": 127369750, + "step": 5925, + "time_per_iteration": 2.755711078643799 + }, + { + "auxiliary_loss_clip": 0.01106274, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_clip": 1.0231117, + "balance_loss_mlp": 1.00394177, + "epoch": 0.7125593699272531, + "flos": 17603734584960.0, + "grad_norm": 1.9564507760004803, + "language_loss": 0.79584605, + "learning_rate": 8.057291008859069e-07, + "loss": 0.8177498, + "num_input_tokens_seen": 127387910, + "step": 5926, + "time_per_iteration": 2.864978551864624 + }, + { + "auxiliary_loss_clip": 0.01125917, + "auxiliary_loss_mlp": 0.01085174, + "balance_loss_clip": 1.02485371, + "balance_loss_mlp": 1.00506568, + "epoch": 0.7126796128178922, + "flos": 28654057526400.0, + "grad_norm": 2.366776596984615, + "language_loss": 0.68265301, + "learning_rate": 8.051043468416187e-07, + "loss": 0.70476395, + "num_input_tokens_seen": 127409160, + "step": 5927, + "time_per_iteration": 2.7320449352264404 + }, + { + "auxiliary_loss_clip": 0.01134697, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_clip": 1.02542162, + "balance_loss_mlp": 1.0043571, + "epoch": 0.7127998557085312, + "flos": 16034438315520.0, + "grad_norm": 2.007128738017163, + "language_loss": 0.82213974, + "learning_rate": 8.044797740620506e-07, + "loss": 0.84433132, + "num_input_tokens_seen": 127427765, + "step": 5928, + "time_per_iteration": 2.657747507095337 + }, + { + "auxiliary_loss_clip": 0.01096984, + "auxiliary_loss_mlp": 0.0108412, + "balance_loss_clip": 1.02283502, + "balance_loss_mlp": 1.00401115, + "epoch": 0.7129200985991703, + "flos": 23403271582080.0, + "grad_norm": 1.9132607601655989, + "language_loss": 0.78534019, + "learning_rate": 8.038553826419494e-07, + "loss": 0.8071512, + "num_input_tokens_seen": 127446475, + "step": 5929, + "time_per_iteration": 2.8005530834198 + }, + { + "auxiliary_loss_clip": 0.01133794, + "auxiliary_loss_mlp": 0.01083647, + "balance_loss_clip": 1.02428901, + "balance_loss_mlp": 1.00353873, + "epoch": 0.7130403414898094, + "flos": 21397445326080.0, + "grad_norm": 1.677412056659109, + "language_loss": 0.81016469, + "learning_rate": 8.032311726760364e-07, + "loss": 0.83233911, + "num_input_tokens_seen": 127467695, + "step": 5930, + "time_per_iteration": 2.651571273803711 + }, + { + "auxiliary_loss_clip": 0.01106432, + "auxiliary_loss_mlp": 0.01085306, + "balance_loss_clip": 1.0238986, + "balance_loss_mlp": 1.00505388, + "epoch": 0.7131605843804485, + "flos": 74739045306240.0, + "grad_norm": 1.8658240407048774, + "language_loss": 0.68772292, + "learning_rate": 8.026071442590022e-07, + "loss": 0.70964026, + "num_input_tokens_seen": 127494590, + "step": 5931, + "time_per_iteration": 3.1567368507385254 + }, + { + "auxiliary_loss_clip": 0.01127031, + "auxiliary_loss_mlp": 0.01084907, + "balance_loss_clip": 1.02677476, + "balance_loss_mlp": 1.00479794, + "epoch": 0.7132808272710875, + "flos": 18368739469440.0, + "grad_norm": 1.7915907432017888, + "language_loss": 0.80942535, + "learning_rate": 8.019832974855134e-07, + "loss": 0.83154476, + "num_input_tokens_seen": 127512550, + "step": 5932, + "time_per_iteration": 2.6447577476501465 + }, + { + "auxiliary_loss_clip": 0.01108225, + "auxiliary_loss_mlp": 0.01085441, + "balance_loss_clip": 1.02509284, + "balance_loss_mlp": 1.0052371, + "epoch": 0.7134010701617267, + "flos": 23253380127360.0, + "grad_norm": 2.867376765648743, + "language_loss": 0.82721186, + "learning_rate": 8.013596324502052e-07, + "loss": 0.84914857, + "num_input_tokens_seen": 127531015, + "step": 5933, + "time_per_iteration": 2.7659783363342285 + }, + { + "auxiliary_loss_clip": 0.01124503, + "auxiliary_loss_mlp": 0.01083915, + "balance_loss_clip": 1.02526104, + "balance_loss_mlp": 1.00385404, + "epoch": 0.7135213130523658, + "flos": 23653137565440.0, + "grad_norm": 1.8330217636207522, + "language_loss": 0.78691757, + "learning_rate": 8.007361492476872e-07, + "loss": 0.80900168, + "num_input_tokens_seen": 127550340, + "step": 5934, + "time_per_iteration": 2.630972146987915 + }, + { + "auxiliary_loss_clip": 0.01106989, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.02208447, + "balance_loss_mlp": 1.00392735, + "epoch": 0.7136415559430048, + "flos": 24790644443520.0, + "grad_norm": 1.5136218141169022, + "language_loss": 0.7897656, + "learning_rate": 8.001128479725426e-07, + "loss": 0.81167537, + "num_input_tokens_seen": 127572245, + "step": 5935, + "time_per_iteration": 2.7872161865234375 + }, + { + "auxiliary_loss_clip": 0.01094236, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_clip": 1.02376342, + "balance_loss_mlp": 1.00455546, + "epoch": 0.713761798833644, + "flos": 18296954138880.0, + "grad_norm": 1.5416444073616524, + "language_loss": 0.81079519, + "learning_rate": 7.994897287193248e-07, + "loss": 0.8325842, + "num_input_tokens_seen": 127591625, + "step": 5936, + "time_per_iteration": 2.756727457046509 + }, + { + "auxiliary_loss_clip": 0.01124723, + "auxiliary_loss_mlp": 0.01083646, + "balance_loss_clip": 1.0237112, + "balance_loss_mlp": 1.00348997, + "epoch": 0.713882041724283, + "flos": 15558262692480.0, + "grad_norm": 2.3940054982411523, + "language_loss": 0.83900797, + "learning_rate": 7.988667915825605e-07, + "loss": 0.86109173, + "num_input_tokens_seen": 127608690, + "step": 5937, + "time_per_iteration": 3.6019954681396484 + }, + { + "auxiliary_loss_clip": 0.0111769, + "auxiliary_loss_mlp": 0.01084731, + "balance_loss_clip": 1.02524328, + "balance_loss_mlp": 1.00447965, + "epoch": 0.7140022846149221, + "flos": 24061011477120.0, + "grad_norm": 1.9152808465582873, + "language_loss": 0.75485075, + "learning_rate": 7.982440366567491e-07, + "loss": 0.77687496, + "num_input_tokens_seen": 127627180, + "step": 5938, + "time_per_iteration": 2.7536685466766357 + }, + { + "auxiliary_loss_clip": 0.01126424, + "auxiliary_loss_mlp": 0.01083768, + "balance_loss_clip": 1.0250206, + "balance_loss_mlp": 1.00361204, + "epoch": 0.7141225275055613, + "flos": 27891710248320.0, + "grad_norm": 1.5599871885345002, + "language_loss": 0.75043678, + "learning_rate": 7.97621464036361e-07, + "loss": 0.77253872, + "num_input_tokens_seen": 127648940, + "step": 5939, + "time_per_iteration": 3.6853702068328857 + }, + { + "auxiliary_loss_clip": 0.01124721, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_clip": 1.02343249, + "balance_loss_mlp": 1.00440097, + "epoch": 0.7142427703962003, + "flos": 19682603147520.0, + "grad_norm": 1.6114142546945696, + "language_loss": 0.67817044, + "learning_rate": 7.969990738158417e-07, + "loss": 0.70026278, + "num_input_tokens_seen": 127667350, + "step": 5940, + "time_per_iteration": 3.594208002090454 + }, + { + "auxiliary_loss_clip": 0.0112485, + "auxiliary_loss_mlp": 0.01083755, + "balance_loss_clip": 1.0246259, + "balance_loss_mlp": 1.00364661, + "epoch": 0.7143630132868394, + "flos": 21032377447680.0, + "grad_norm": 1.8529062018991636, + "language_loss": 0.85261863, + "learning_rate": 7.963768660896062e-07, + "loss": 0.87470472, + "num_input_tokens_seen": 127685760, + "step": 5941, + "time_per_iteration": 3.5819220542907715 + }, + { + "auxiliary_loss_clip": 0.01126544, + "auxiliary_loss_mlp": 0.01085826, + "balance_loss_clip": 1.0253998, + "balance_loss_mlp": 1.00562239, + "epoch": 0.7144832561774785, + "flos": 24129923719680.0, + "grad_norm": 1.8474011888346857, + "language_loss": 0.82434583, + "learning_rate": 7.957548409520432e-07, + "loss": 0.84646952, + "num_input_tokens_seen": 127704985, + "step": 5942, + "time_per_iteration": 2.635690212249756 + }, + { + "auxiliary_loss_clip": 0.01105977, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_clip": 1.02292919, + "balance_loss_mlp": 1.00409091, + "epoch": 0.7146034990681176, + "flos": 16325817442560.0, + "grad_norm": 1.742403740382587, + "language_loss": 0.83673763, + "learning_rate": 7.951329984975135e-07, + "loss": 0.85863942, + "num_input_tokens_seen": 127721925, + "step": 5943, + "time_per_iteration": 2.7469582557678223 + }, + { + "auxiliary_loss_clip": 0.01074499, + "auxiliary_loss_mlp": 0.01078909, + "balance_loss_clip": 1.01987374, + "balance_loss_mlp": 0.99994481, + "epoch": 0.7147237419587567, + "flos": 69627164232960.0, + "grad_norm": 0.7208221450547272, + "language_loss": 0.54274434, + "learning_rate": 7.94511338820349e-07, + "loss": 0.56427842, + "num_input_tokens_seen": 127784230, + "step": 5944, + "time_per_iteration": 3.397655487060547 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.00873, + "balance_loss_clip": 1.02212834, + "balance_loss_mlp": 1.00003791, + "epoch": 0.7148439848493958, + "flos": 22266806198400.0, + "grad_norm": 1.934425502828507, + "language_loss": 0.78318805, + "learning_rate": 7.938898620148575e-07, + "loss": 0.8030442, + "num_input_tokens_seen": 127801990, + "step": 5945, + "time_per_iteration": 2.665792465209961 + }, + { + "auxiliary_loss_clip": 0.01113728, + "auxiliary_loss_mlp": 0.0108338, + "balance_loss_clip": 1.02283418, + "balance_loss_mlp": 1.00322318, + "epoch": 0.7149642277400349, + "flos": 17931383470080.0, + "grad_norm": 1.9645266245256223, + "language_loss": 0.7079258, + "learning_rate": 7.932685681753135e-07, + "loss": 0.7298969, + "num_input_tokens_seen": 127819270, + "step": 5946, + "time_per_iteration": 2.7457289695739746 + }, + { + "auxiliary_loss_clip": 0.01135354, + "auxiliary_loss_mlp": 0.01083829, + "balance_loss_clip": 1.02644491, + "balance_loss_mlp": 1.00372076, + "epoch": 0.7150844706306739, + "flos": 31681937370240.0, + "grad_norm": 1.8719667454401991, + "language_loss": 0.6259048, + "learning_rate": 7.92647457395969e-07, + "loss": 0.64809656, + "num_input_tokens_seen": 127841095, + "step": 5947, + "time_per_iteration": 2.6704607009887695 + }, + { + "auxiliary_loss_clip": 0.01093483, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_clip": 1.02417243, + "balance_loss_mlp": 1.0051465, + "epoch": 0.7152047135213131, + "flos": 10926217451520.0, + "grad_norm": 1.9508105237481113, + "language_loss": 0.73695183, + "learning_rate": 7.920265297710444e-07, + "loss": 0.75873971, + "num_input_tokens_seen": 127858485, + "step": 5948, + "time_per_iteration": 2.781934976577759 + }, + { + "auxiliary_loss_clip": 0.01126354, + "auxiliary_loss_mlp": 0.0108382, + "balance_loss_clip": 1.02534032, + "balance_loss_mlp": 1.00371099, + "epoch": 0.7153249564119522, + "flos": 20995640812800.0, + "grad_norm": 2.441493082988256, + "language_loss": 0.73331058, + "learning_rate": 7.914057853947363e-07, + "loss": 0.7554124, + "num_input_tokens_seen": 127877665, + "step": 5949, + "time_per_iteration": 2.6785531044006348 + }, + { + "auxiliary_loss_clip": 0.01104132, + "auxiliary_loss_mlp": 0.01085064, + "balance_loss_clip": 1.02080774, + "balance_loss_mlp": 1.00471699, + "epoch": 0.7154451993025912, + "flos": 24243114453120.0, + "grad_norm": 1.7375259730428028, + "language_loss": 0.62909746, + "learning_rate": 7.907852243612089e-07, + "loss": 0.65098941, + "num_input_tokens_seen": 127898070, + "step": 5950, + "time_per_iteration": 2.8183727264404297 + }, + { + "auxiliary_loss_clip": 0.01115358, + "auxiliary_loss_mlp": 0.01084102, + "balance_loss_clip": 1.02384746, + "balance_loss_mlp": 1.00394535, + "epoch": 0.7155654421932304, + "flos": 23330947547520.0, + "grad_norm": 1.8472537526011397, + "language_loss": 0.72197664, + "learning_rate": 7.901648467646009e-07, + "loss": 0.74397129, + "num_input_tokens_seen": 127917010, + "step": 5951, + "time_per_iteration": 2.716705560684204 + }, + { + "auxiliary_loss_clip": 0.01134623, + "auxiliary_loss_mlp": 0.01084041, + "balance_loss_clip": 1.0250349, + "balance_loss_mlp": 1.00383723, + "epoch": 0.7156856850838694, + "flos": 22711883621760.0, + "grad_norm": 1.6251707202257704, + "language_loss": 0.72522902, + "learning_rate": 7.895446526990244e-07, + "loss": 0.74741566, + "num_input_tokens_seen": 127937025, + "step": 5952, + "time_per_iteration": 2.773470401763916 + }, + { + "auxiliary_loss_clip": 0.01099598, + "auxiliary_loss_mlp": 0.01084267, + "balance_loss_clip": 1.02335072, + "balance_loss_mlp": 1.00415874, + "epoch": 0.7158059279745085, + "flos": 19865424395520.0, + "grad_norm": 1.5847937089391897, + "language_loss": 0.75565064, + "learning_rate": 7.889246422585609e-07, + "loss": 0.7774893, + "num_input_tokens_seen": 127956410, + "step": 5953, + "time_per_iteration": 2.875359058380127 + }, + { + "auxiliary_loss_clip": 0.0113468, + "auxiliary_loss_mlp": 0.01084779, + "balance_loss_clip": 1.02519774, + "balance_loss_mlp": 1.00457466, + "epoch": 0.7159261708651476, + "flos": 24134772055680.0, + "grad_norm": 1.6293477601091277, + "language_loss": 0.73231, + "learning_rate": 7.883048155372675e-07, + "loss": 0.75450456, + "num_input_tokens_seen": 127974925, + "step": 5954, + "time_per_iteration": 2.6889023780822754 + }, + { + "auxiliary_loss_clip": 0.011174, + "auxiliary_loss_mlp": 0.01083633, + "balance_loss_clip": 1.0248822, + "balance_loss_mlp": 1.0035249, + "epoch": 0.7160464137557867, + "flos": 16983198201600.0, + "grad_norm": 2.2468495564049067, + "language_loss": 0.71352476, + "learning_rate": 7.876851726291698e-07, + "loss": 0.73553509, + "num_input_tokens_seen": 127993225, + "step": 5955, + "time_per_iteration": 2.7916650772094727 + }, + { + "auxiliary_loss_clip": 0.01109807, + "auxiliary_loss_mlp": 0.01085806, + "balance_loss_clip": 1.02462959, + "balance_loss_mlp": 1.00569749, + "epoch": 0.7161666566464258, + "flos": 25228251838080.0, + "grad_norm": 1.8850116930821128, + "language_loss": 0.78118813, + "learning_rate": 7.870657136282666e-07, + "loss": 0.80314422, + "num_input_tokens_seen": 128012085, + "step": 5956, + "time_per_iteration": 2.801952838897705 + }, + { + "auxiliary_loss_clip": 0.01126533, + "auxiliary_loss_mlp": 0.0108448, + "balance_loss_clip": 1.02528918, + "balance_loss_mlp": 1.00427604, + "epoch": 0.7162868995370649, + "flos": 26468390851200.0, + "grad_norm": 1.4161827131393918, + "language_loss": 0.8192246, + "learning_rate": 7.86446438628531e-07, + "loss": 0.8413347, + "num_input_tokens_seen": 128033155, + "step": 5957, + "time_per_iteration": 2.8183772563934326 + }, + { + "auxiliary_loss_clip": 0.01115259, + "auxiliary_loss_mlp": 0.01079206, + "balance_loss_clip": 1.01982045, + "balance_loss_mlp": 1.00024164, + "epoch": 0.716407142427704, + "flos": 69998912040960.0, + "grad_norm": 0.7603540556604336, + "language_loss": 0.56870961, + "learning_rate": 7.858273477239059e-07, + "loss": 0.59065425, + "num_input_tokens_seen": 128101575, + "step": 5958, + "time_per_iteration": 3.211444854736328 + }, + { + "auxiliary_loss_clip": 0.01092723, + "auxiliary_loss_mlp": 0.01086862, + "balance_loss_clip": 1.02296185, + "balance_loss_mlp": 1.00670612, + "epoch": 0.716527385318343, + "flos": 20740459616640.0, + "grad_norm": 1.6080879000567176, + "language_loss": 0.7132622, + "learning_rate": 7.852084410083067e-07, + "loss": 0.73505807, + "num_input_tokens_seen": 128120395, + "step": 5959, + "time_per_iteration": 2.848658323287964 + }, + { + "auxiliary_loss_clip": 0.01112743, + "auxiliary_loss_mlp": 0.01083463, + "balance_loss_clip": 1.02233243, + "balance_loss_mlp": 1.00335443, + "epoch": 0.7166476282089821, + "flos": 25371966153600.0, + "grad_norm": 1.5645949097473801, + "language_loss": 0.63442779, + "learning_rate": 7.84589718575621e-07, + "loss": 0.65638983, + "num_input_tokens_seen": 128140840, + "step": 5960, + "time_per_iteration": 2.7844462394714355 + }, + { + "auxiliary_loss_clip": 0.01118549, + "auxiliary_loss_mlp": 0.01083975, + "balance_loss_clip": 1.02472198, + "balance_loss_mlp": 1.00381875, + "epoch": 0.7167678710996213, + "flos": 24133730561280.0, + "grad_norm": 2.2877441071866547, + "language_loss": 0.68985933, + "learning_rate": 7.83971180519708e-07, + "loss": 0.71188456, + "num_input_tokens_seen": 128159695, + "step": 5961, + "time_per_iteration": 2.771380662918091 + }, + { + "auxiliary_loss_clip": 0.01137455, + "auxiliary_loss_mlp": 0.01084508, + "balance_loss_clip": 1.02785397, + "balance_loss_mlp": 1.00425637, + "epoch": 0.7168881139902603, + "flos": 30226586019840.0, + "grad_norm": 1.8960107775242525, + "language_loss": 0.75288415, + "learning_rate": 7.833528269344008e-07, + "loss": 0.77510381, + "num_input_tokens_seen": 128179600, + "step": 5962, + "time_per_iteration": 2.7006068229675293 + }, + { + "auxiliary_loss_clip": 0.01104825, + "auxiliary_loss_mlp": 0.01084697, + "balance_loss_clip": 1.0223031, + "balance_loss_mlp": 1.00444531, + "epoch": 0.7170083568808994, + "flos": 14606414236800.0, + "grad_norm": 1.9640822231738733, + "language_loss": 0.77328801, + "learning_rate": 7.827346579135023e-07, + "loss": 0.79518318, + "num_input_tokens_seen": 128196940, + "step": 5963, + "time_per_iteration": 3.6071271896362305 + }, + { + "auxiliary_loss_clip": 0.01114393, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_clip": 1.02308619, + "balance_loss_mlp": 1.00375891, + "epoch": 0.7171285997715385, + "flos": 23331091201920.0, + "grad_norm": 1.964610151252708, + "language_loss": 0.82824063, + "learning_rate": 7.821166735507885e-07, + "loss": 0.85022461, + "num_input_tokens_seen": 128215970, + "step": 5964, + "time_per_iteration": 3.6909213066101074 + }, + { + "auxiliary_loss_clip": 0.01134941, + "auxiliary_loss_mlp": 0.0108553, + "balance_loss_clip": 1.02581692, + "balance_loss_mlp": 1.00537372, + "epoch": 0.7172488426621776, + "flos": 16543543731840.0, + "grad_norm": 1.5785484666761023, + "language_loss": 0.68784261, + "learning_rate": 7.81498873940007e-07, + "loss": 0.71004736, + "num_input_tokens_seen": 128233185, + "step": 5965, + "time_per_iteration": 3.4614412784576416 + }, + { + "auxiliary_loss_clip": 0.01126609, + "auxiliary_loss_mlp": 0.01083559, + "balance_loss_clip": 1.02480388, + "balance_loss_mlp": 1.00335538, + "epoch": 0.7173690855528166, + "flos": 26541612725760.0, + "grad_norm": 2.0028574779267303, + "language_loss": 0.77425504, + "learning_rate": 7.808812591748768e-07, + "loss": 0.79635674, + "num_input_tokens_seen": 128253565, + "step": 5966, + "time_per_iteration": 3.594895839691162 + }, + { + "auxiliary_loss_clip": 0.01106718, + "auxiliary_loss_mlp": 0.01083886, + "balance_loss_clip": 1.02370894, + "balance_loss_mlp": 1.00368214, + "epoch": 0.7174893284434558, + "flos": 22784099915520.0, + "grad_norm": 1.9404834018297066, + "language_loss": 0.65082043, + "learning_rate": 7.802638293490915e-07, + "loss": 0.67272645, + "num_input_tokens_seen": 128273210, + "step": 5967, + "time_per_iteration": 2.800342559814453 + }, + { + "auxiliary_loss_clip": 0.01118902, + "auxiliary_loss_mlp": 0.0108422, + "balance_loss_clip": 1.0260371, + "balance_loss_mlp": 1.00406408, + "epoch": 0.7176095713340949, + "flos": 23293564467840.0, + "grad_norm": 2.0488430412192775, + "language_loss": 0.76671082, + "learning_rate": 7.796465845563123e-07, + "loss": 0.78874207, + "num_input_tokens_seen": 128292085, + "step": 5968, + "time_per_iteration": 2.7306711673736572 + }, + { + "auxiliary_loss_clip": 0.01110493, + "auxiliary_loss_mlp": 0.00872873, + "balance_loss_clip": 1.02447867, + "balance_loss_mlp": 1.00017381, + "epoch": 0.7177298142247339, + "flos": 25591631777280.0, + "grad_norm": 1.8229260797192, + "language_loss": 0.79368913, + "learning_rate": 7.790295248901766e-07, + "loss": 0.81352282, + "num_input_tokens_seen": 128313215, + "step": 5969, + "time_per_iteration": 2.756324291229248 + }, + { + "auxiliary_loss_clip": 0.01126909, + "auxiliary_loss_mlp": 0.01083419, + "balance_loss_clip": 1.02604735, + "balance_loss_mlp": 1.00321555, + "epoch": 0.7178500571153731, + "flos": 31652778504960.0, + "grad_norm": 1.5466669718414818, + "language_loss": 0.62265557, + "learning_rate": 7.784126504442902e-07, + "loss": 0.64475888, + "num_input_tokens_seen": 128336445, + "step": 5970, + "time_per_iteration": 2.7940995693206787 + }, + { + "auxiliary_loss_clip": 0.01106397, + "auxiliary_loss_mlp": 0.01083432, + "balance_loss_clip": 1.02386701, + "balance_loss_mlp": 1.00332355, + "epoch": 0.7179703000060121, + "flos": 19427242383360.0, + "grad_norm": 1.364955187748583, + "language_loss": 0.6763922, + "learning_rate": 7.777959613122351e-07, + "loss": 0.69829047, + "num_input_tokens_seen": 128356270, + "step": 5971, + "time_per_iteration": 2.770366668701172 + }, + { + "auxiliary_loss_clip": 0.01107307, + "auxiliary_loss_mlp": 0.01084752, + "balance_loss_clip": 1.02295303, + "balance_loss_mlp": 1.00464356, + "epoch": 0.7180905428966512, + "flos": 28839249072000.0, + "grad_norm": 1.7538044447249308, + "language_loss": 0.78115809, + "learning_rate": 7.771794575875604e-07, + "loss": 0.80307871, + "num_input_tokens_seen": 128378140, + "step": 5972, + "time_per_iteration": 2.797199010848999 + }, + { + "auxiliary_loss_clip": 0.01120811, + "auxiliary_loss_mlp": 0.01084586, + "balance_loss_clip": 1.02184176, + "balance_loss_mlp": 1.00433397, + "epoch": 0.7182107857872904, + "flos": 20047563285120.0, + "grad_norm": 2.284931927183298, + "language_loss": 0.77850461, + "learning_rate": 7.765631393637888e-07, + "loss": 0.80055857, + "num_input_tokens_seen": 128396335, + "step": 5973, + "time_per_iteration": 2.6692185401916504 + }, + { + "auxiliary_loss_clip": 0.011266, + "auxiliary_loss_mlp": 0.01084126, + "balance_loss_clip": 1.02518582, + "balance_loss_mlp": 1.00382638, + "epoch": 0.7183310286779294, + "flos": 22747686503040.0, + "grad_norm": 2.6995639216614125, + "language_loss": 0.48091453, + "learning_rate": 7.75947006734417e-07, + "loss": 0.50302184, + "num_input_tokens_seen": 128414115, + "step": 5974, + "time_per_iteration": 2.6954922676086426 + }, + { + "auxiliary_loss_clip": 0.01135514, + "auxiliary_loss_mlp": 0.01083853, + "balance_loss_clip": 1.02548361, + "balance_loss_mlp": 1.00374401, + "epoch": 0.7184512715685685, + "flos": 17158262112000.0, + "grad_norm": 2.579366218291436, + "language_loss": 0.8255192, + "learning_rate": 7.753310597929101e-07, + "loss": 0.84771293, + "num_input_tokens_seen": 128430755, + "step": 5975, + "time_per_iteration": 2.6001627445220947 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01079108, + "balance_loss_clip": 1.02006388, + "balance_loss_mlp": 1.00014365, + "epoch": 0.7185715144592076, + "flos": 65509611448320.0, + "grad_norm": 0.7593908731388751, + "language_loss": 0.55623025, + "learning_rate": 7.747152986327095e-07, + "loss": 0.57817602, + "num_input_tokens_seen": 128491300, + "step": 5976, + "time_per_iteration": 3.1115291118621826 + }, + { + "auxiliary_loss_clip": 0.01092614, + "auxiliary_loss_mlp": 0.01084354, + "balance_loss_clip": 1.0226903, + "balance_loss_mlp": 1.00434124, + "epoch": 0.7186917573498467, + "flos": 16180522928640.0, + "grad_norm": 1.6593298875730043, + "language_loss": 0.67960083, + "learning_rate": 7.740997233472228e-07, + "loss": 0.70137048, + "num_input_tokens_seen": 128508920, + "step": 5977, + "time_per_iteration": 2.7815568447113037 + }, + { + "auxiliary_loss_clip": 0.0111495, + "auxiliary_loss_mlp": 0.01083332, + "balance_loss_clip": 1.0234735, + "balance_loss_mlp": 1.00322294, + "epoch": 0.7188120002404857, + "flos": 29242274647680.0, + "grad_norm": 3.9603219666210343, + "language_loss": 0.71165627, + "learning_rate": 7.734843340298329e-07, + "loss": 0.73363912, + "num_input_tokens_seen": 128528745, + "step": 5978, + "time_per_iteration": 2.7392513751983643 + }, + { + "auxiliary_loss_clip": 0.01116285, + "auxiliary_loss_mlp": 0.01084348, + "balance_loss_clip": 1.02359927, + "balance_loss_mlp": 1.00404882, + "epoch": 0.7189322431311249, + "flos": 33401161008000.0, + "grad_norm": 2.2022867626806044, + "language_loss": 0.75157964, + "learning_rate": 7.72869130773895e-07, + "loss": 0.77358603, + "num_input_tokens_seen": 128549345, + "step": 5979, + "time_per_iteration": 2.8329527378082275 + }, + { + "auxiliary_loss_clip": 0.01107788, + "auxiliary_loss_mlp": 0.01079036, + "balance_loss_clip": 1.02027488, + "balance_loss_mlp": 1.000072, + "epoch": 0.719052486021764, + "flos": 61351263792000.0, + "grad_norm": 0.7859619160820625, + "language_loss": 0.59416437, + "learning_rate": 7.722541136727343e-07, + "loss": 0.6160326, + "num_input_tokens_seen": 128605360, + "step": 5980, + "time_per_iteration": 3.115037202835083 + }, + { + "auxiliary_loss_clip": 0.01123808, + "auxiliary_loss_mlp": 0.01084914, + "balance_loss_clip": 1.02336597, + "balance_loss_mlp": 1.00471044, + "epoch": 0.719172728912403, + "flos": 15596795007360.0, + "grad_norm": 1.968271736472376, + "language_loss": 0.80547845, + "learning_rate": 7.716392828196483e-07, + "loss": 0.82756567, + "num_input_tokens_seen": 128623160, + "step": 5981, + "time_per_iteration": 2.653428554534912 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.01085609, + "balance_loss_clip": 1.0246315, + "balance_loss_mlp": 1.00540471, + "epoch": 0.7192929718030422, + "flos": 15553162961280.0, + "grad_norm": 2.9487928055428205, + "language_loss": 0.77466196, + "learning_rate": 7.710246383079064e-07, + "loss": 0.79676592, + "num_input_tokens_seen": 128638545, + "step": 5982, + "time_per_iteration": 2.619826316833496 + }, + { + "auxiliary_loss_clip": 0.01117264, + "auxiliary_loss_mlp": 0.01084565, + "balance_loss_clip": 1.02370167, + "balance_loss_mlp": 1.00436115, + "epoch": 0.7194132146936812, + "flos": 21862487733120.0, + "grad_norm": 2.556256844068833, + "language_loss": 0.91912121, + "learning_rate": 7.704101802307492e-07, + "loss": 0.94113952, + "num_input_tokens_seen": 128650845, + "step": 5983, + "time_per_iteration": 2.8163022994995117 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.0108446, + "balance_loss_clip": 1.02582693, + "balance_loss_mlp": 1.00435162, + "epoch": 0.7195334575843203, + "flos": 27338900958720.0, + "grad_norm": 2.222091447966642, + "language_loss": 0.87101191, + "learning_rate": 7.697959086813912e-07, + "loss": 0.8929491, + "num_input_tokens_seen": 128667010, + "step": 5984, + "time_per_iteration": 2.7883386611938477 + }, + { + "auxiliary_loss_clip": 0.01106945, + "auxiliary_loss_mlp": 0.01083738, + "balance_loss_clip": 1.02344894, + "balance_loss_mlp": 1.00367737, + "epoch": 0.7196537004749595, + "flos": 18770615809920.0, + "grad_norm": 1.816120867356952, + "language_loss": 0.80365872, + "learning_rate": 7.691818237530145e-07, + "loss": 0.82556558, + "num_input_tokens_seen": 128685870, + "step": 5985, + "time_per_iteration": 2.7784619331359863 + }, + { + "auxiliary_loss_clip": 0.01101396, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_clip": 1.02422523, + "balance_loss_mlp": 1.00437295, + "epoch": 0.7197739433655985, + "flos": 24531009960960.0, + "grad_norm": 1.816232203930677, + "language_loss": 0.7739116, + "learning_rate": 7.685679255387774e-07, + "loss": 0.79577136, + "num_input_tokens_seen": 128704185, + "step": 5986, + "time_per_iteration": 2.8394594192504883 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01083914, + "balance_loss_clip": 1.02217591, + "balance_loss_mlp": 1.00371027, + "epoch": 0.7198941862562376, + "flos": 18040587793920.0, + "grad_norm": 2.1750540002573446, + "language_loss": 0.77015102, + "learning_rate": 7.679542141318065e-07, + "loss": 0.79211843, + "num_input_tokens_seen": 128721290, + "step": 5987, + "time_per_iteration": 2.6951980590820312 + }, + { + "auxiliary_loss_clip": 0.01118213, + "auxiliary_loss_mlp": 0.01083863, + "balance_loss_clip": 1.02455366, + "balance_loss_mlp": 1.00375402, + "epoch": 0.7200144291468767, + "flos": 29022393542400.0, + "grad_norm": 1.6572446022314935, + "language_loss": 0.75699604, + "learning_rate": 7.673406896252013e-07, + "loss": 0.77901685, + "num_input_tokens_seen": 128742665, + "step": 5988, + "time_per_iteration": 3.7048768997192383 + }, + { + "auxiliary_loss_clip": 0.01109016, + "auxiliary_loss_mlp": 0.01085615, + "balance_loss_clip": 1.0245924, + "balance_loss_mlp": 1.00531554, + "epoch": 0.7201346720375158, + "flos": 25374264624000.0, + "grad_norm": 1.5673554031889025, + "language_loss": 0.78294998, + "learning_rate": 7.667273521120347e-07, + "loss": 0.80489624, + "num_input_tokens_seen": 128762225, + "step": 5989, + "time_per_iteration": 2.754042148590088 + }, + { + "auxiliary_loss_clip": 0.0110592, + "auxiliary_loss_mlp": 0.01084776, + "balance_loss_clip": 1.02284253, + "balance_loss_mlp": 1.00457215, + "epoch": 0.7202549149281549, + "flos": 14355614499840.0, + "grad_norm": 1.8694065759259582, + "language_loss": 0.7949667, + "learning_rate": 7.661142016853468e-07, + "loss": 0.81687367, + "num_input_tokens_seen": 128779585, + "step": 5990, + "time_per_iteration": 3.7900049686431885 + }, + { + "auxiliary_loss_clip": 0.01097821, + "auxiliary_loss_mlp": 0.01085142, + "balance_loss_clip": 1.02270818, + "balance_loss_mlp": 1.00493765, + "epoch": 0.7203751578187939, + "flos": 23001682550400.0, + "grad_norm": 3.0540111559722347, + "language_loss": 0.74832249, + "learning_rate": 7.655012384381543e-07, + "loss": 0.77015209, + "num_input_tokens_seen": 128799070, + "step": 5991, + "time_per_iteration": 3.644214153289795 + }, + { + "auxiliary_loss_clip": 0.01115952, + "auxiliary_loss_mlp": 0.01085722, + "balance_loss_clip": 1.02483249, + "balance_loss_mlp": 1.0055176, + "epoch": 0.7204954007094331, + "flos": 23692424065920.0, + "grad_norm": 2.5128683677629557, + "language_loss": 0.81877899, + "learning_rate": 7.648884624634415e-07, + "loss": 0.84079564, + "num_input_tokens_seen": 128817620, + "step": 5992, + "time_per_iteration": 3.677730083465576 + }, + { + "auxiliary_loss_clip": 0.01124219, + "auxiliary_loss_mlp": 0.01083938, + "balance_loss_clip": 1.02426028, + "balance_loss_mlp": 1.00373447, + "epoch": 0.7206156436000721, + "flos": 16253026531200.0, + "grad_norm": 6.189309940138031, + "language_loss": 0.88589281, + "learning_rate": 7.642758738541683e-07, + "loss": 0.90797442, + "num_input_tokens_seen": 128834200, + "step": 5993, + "time_per_iteration": 2.668668270111084 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.01079226, + "balance_loss_clip": 1.02002215, + "balance_loss_mlp": 1.00026226, + "epoch": 0.7207358864907112, + "flos": 54377806504320.0, + "grad_norm": 0.7561233442477968, + "language_loss": 0.60805708, + "learning_rate": 7.636634727032621e-07, + "loss": 0.62992233, + "num_input_tokens_seen": 128891305, + "step": 5994, + "time_per_iteration": 3.1754415035247803 + }, + { + "auxiliary_loss_clip": 0.01110069, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_clip": 1.02436423, + "balance_loss_mlp": 1.00402069, + "epoch": 0.7208561293813504, + "flos": 19135540033920.0, + "grad_norm": 2.0592873258578304, + "language_loss": 0.78694105, + "learning_rate": 7.630512591036231e-07, + "loss": 0.80888486, + "num_input_tokens_seen": 128910615, + "step": 5995, + "time_per_iteration": 2.7317333221435547 + }, + { + "auxiliary_loss_clip": 0.01125275, + "auxiliary_loss_mlp": 0.01083647, + "balance_loss_clip": 1.02456975, + "balance_loss_mlp": 1.00349033, + "epoch": 0.7209763722719894, + "flos": 17748526308480.0, + "grad_norm": 2.341780019571189, + "language_loss": 0.64522552, + "learning_rate": 7.624392331481255e-07, + "loss": 0.66731471, + "num_input_tokens_seen": 128928270, + "step": 5996, + "time_per_iteration": 2.7666375637054443 + }, + { + "auxiliary_loss_clip": 0.01106903, + "auxiliary_loss_mlp": 0.01078894, + "balance_loss_clip": 1.01993191, + "balance_loss_mlp": 0.9999302, + "epoch": 0.7210966151626285, + "flos": 66819488716800.0, + "grad_norm": 0.7479744843641631, + "language_loss": 0.51884878, + "learning_rate": 7.618273949296115e-07, + "loss": 0.54070675, + "num_input_tokens_seen": 128987780, + "step": 5997, + "time_per_iteration": 3.2031140327453613 + }, + { + "auxiliary_loss_clip": 0.01115763, + "auxiliary_loss_mlp": 0.01084092, + "balance_loss_clip": 1.02359366, + "balance_loss_mlp": 1.00384033, + "epoch": 0.7212168580532676, + "flos": 21141869080320.0, + "grad_norm": 1.921975001658763, + "language_loss": 0.68696666, + "learning_rate": 7.612157445408987e-07, + "loss": 0.70896524, + "num_input_tokens_seen": 129005590, + "step": 5998, + "time_per_iteration": 2.741745710372925 + }, + { + "auxiliary_loss_clip": 0.01113343, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_clip": 1.02247691, + "balance_loss_mlp": 1.00357115, + "epoch": 0.7213371009439067, + "flos": 22345738335360.0, + "grad_norm": 2.1738943215555984, + "language_loss": 0.74250865, + "learning_rate": 7.606042820747716e-07, + "loss": 0.76447976, + "num_input_tokens_seen": 129021995, + "step": 5999, + "time_per_iteration": 2.740349769592285 + }, + { + "auxiliary_loss_clip": 0.01117315, + "auxiliary_loss_mlp": 0.01084676, + "balance_loss_clip": 1.02479744, + "balance_loss_mlp": 1.00451922, + "epoch": 0.7214573438345457, + "flos": 18515901490560.0, + "grad_norm": 1.73789248368283, + "language_loss": 0.85535455, + "learning_rate": 7.599930076239889e-07, + "loss": 0.87737441, + "num_input_tokens_seen": 129039280, + "step": 6000, + "time_per_iteration": 2.7158355712890625 + }, + { + "auxiliary_loss_clip": 0.0109843, + "auxiliary_loss_mlp": 0.0087288, + "balance_loss_clip": 1.02321768, + "balance_loss_mlp": 1.00011396, + "epoch": 0.7215775867251849, + "flos": 35736108606720.0, + "grad_norm": 1.946642798640249, + "language_loss": 0.70851511, + "learning_rate": 7.593819212812818e-07, + "loss": 0.72822821, + "num_input_tokens_seen": 129060860, + "step": 6001, + "time_per_iteration": 2.9457361698150635 + }, + { + "auxiliary_loss_clip": 0.01125432, + "auxiliary_loss_mlp": 0.0108408, + "balance_loss_clip": 1.02507496, + "balance_loss_mlp": 1.00397146, + "epoch": 0.721697829615824, + "flos": 20372410909440.0, + "grad_norm": 1.8034700348760966, + "language_loss": 0.71712387, + "learning_rate": 7.587710231393508e-07, + "loss": 0.73921901, + "num_input_tokens_seen": 129079215, + "step": 6002, + "time_per_iteration": 2.6657299995422363 + }, + { + "auxiliary_loss_clip": 0.01077421, + "auxiliary_loss_mlp": 0.01083218, + "balance_loss_clip": 1.02084661, + "balance_loss_mlp": 1.00310922, + "epoch": 0.721818072506463, + "flos": 20229809915520.0, + "grad_norm": 1.8824801543581844, + "language_loss": 0.8380214, + "learning_rate": 7.581603132908685e-07, + "loss": 0.85962772, + "num_input_tokens_seen": 129097185, + "step": 6003, + "time_per_iteration": 2.8437671661376953 + }, + { + "auxiliary_loss_clip": 0.01106831, + "auxiliary_loss_mlp": 0.01084238, + "balance_loss_clip": 1.02329278, + "balance_loss_mlp": 1.0040822, + "epoch": 0.7219383153971022, + "flos": 18186887888640.0, + "grad_norm": 1.8679714405754348, + "language_loss": 0.78769886, + "learning_rate": 7.575497918284795e-07, + "loss": 0.80960953, + "num_input_tokens_seen": 129114730, + "step": 6004, + "time_per_iteration": 2.7361130714416504 + }, + { + "auxiliary_loss_clip": 0.01133839, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_clip": 1.02402651, + "balance_loss_mlp": 1.00435376, + "epoch": 0.7220585582877412, + "flos": 17342124854400.0, + "grad_norm": 2.36718816803693, + "language_loss": 0.74241972, + "learning_rate": 7.569394588447984e-07, + "loss": 0.76460415, + "num_input_tokens_seen": 129131745, + "step": 6005, + "time_per_iteration": 2.679131269454956 + }, + { + "auxiliary_loss_clip": 0.01125801, + "auxiliary_loss_mlp": 0.010833, + "balance_loss_clip": 1.02436602, + "balance_loss_mlp": 1.00319099, + "epoch": 0.7221788011783803, + "flos": 16976338704000.0, + "grad_norm": 2.873160684190321, + "language_loss": 0.77822065, + "learning_rate": 7.563293144324146e-07, + "loss": 0.80031168, + "num_input_tokens_seen": 129147295, + "step": 6006, + "time_per_iteration": 2.6511905193328857 + }, + { + "auxiliary_loss_clip": 0.01135901, + "auxiliary_loss_mlp": 0.01083717, + "balance_loss_clip": 1.026968, + "balance_loss_mlp": 1.00360799, + "epoch": 0.7222990440690195, + "flos": 26286359702400.0, + "grad_norm": 1.7171579029564115, + "language_loss": 0.79976344, + "learning_rate": 7.557193586838834e-07, + "loss": 0.82195961, + "num_input_tokens_seen": 129162660, + "step": 6007, + "time_per_iteration": 2.696669816970825 + }, + { + "auxiliary_loss_clip": 0.01099283, + "auxiliary_loss_mlp": 0.0108406, + "balance_loss_clip": 1.02348042, + "balance_loss_mlp": 1.00390375, + "epoch": 0.7224192869596585, + "flos": 17601687509760.0, + "grad_norm": 2.279781459699928, + "language_loss": 0.71008813, + "learning_rate": 7.551095916917371e-07, + "loss": 0.73192155, + "num_input_tokens_seen": 129179990, + "step": 6008, + "time_per_iteration": 2.710817813873291 + }, + { + "auxiliary_loss_clip": 0.01092257, + "auxiliary_loss_mlp": 0.01085471, + "balance_loss_clip": 1.02331042, + "balance_loss_mlp": 1.00517142, + "epoch": 0.7225395298502976, + "flos": 12932331016320.0, + "grad_norm": 2.3227631460976945, + "language_loss": 0.66489542, + "learning_rate": 7.545000135484758e-07, + "loss": 0.68667269, + "num_input_tokens_seen": 129197425, + "step": 6009, + "time_per_iteration": 2.724005699157715 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.00873042, + "balance_loss_clip": 1.02451777, + "balance_loss_mlp": 1.00011492, + "epoch": 0.7226597727409367, + "flos": 29643899592960.0, + "grad_norm": 1.9236378705081771, + "language_loss": 0.62312055, + "learning_rate": 7.538906243465714e-07, + "loss": 0.64319098, + "num_input_tokens_seen": 129217560, + "step": 6010, + "time_per_iteration": 2.7539494037628174 + }, + { + "auxiliary_loss_clip": 0.01135953, + "auxiliary_loss_mlp": 0.01085109, + "balance_loss_clip": 1.02642536, + "balance_loss_mlp": 1.00495231, + "epoch": 0.7227800156315758, + "flos": 13771635183360.0, + "grad_norm": 1.877638536463119, + "language_loss": 0.78617859, + "learning_rate": 7.5328142417847e-07, + "loss": 0.80838919, + "num_input_tokens_seen": 129234325, + "step": 6011, + "time_per_iteration": 2.5952837467193604 + }, + { + "auxiliary_loss_clip": 0.01126167, + "auxiliary_loss_mlp": 0.01083954, + "balance_loss_clip": 1.02508783, + "balance_loss_mlp": 1.00389314, + "epoch": 0.7229002585222148, + "flos": 20301882554880.0, + "grad_norm": 1.67154167450569, + "language_loss": 0.69027579, + "learning_rate": 7.526724131365838e-07, + "loss": 0.71237701, + "num_input_tokens_seen": 129255280, + "step": 6012, + "time_per_iteration": 2.6983890533447266 + }, + { + "auxiliary_loss_clip": 0.01109986, + "auxiliary_loss_mlp": 0.01084178, + "balance_loss_clip": 1.02465463, + "balance_loss_mlp": 1.00392604, + "epoch": 0.723020501412854, + "flos": 16581250033920.0, + "grad_norm": 1.7131877423640671, + "language_loss": 0.70183402, + "learning_rate": 7.520635913133017e-07, + "loss": 0.72377563, + "num_input_tokens_seen": 129273910, + "step": 6013, + "time_per_iteration": 2.70017147064209 + }, + { + "auxiliary_loss_clip": 0.01125505, + "auxiliary_loss_mlp": 0.0108528, + "balance_loss_clip": 1.02440643, + "balance_loss_mlp": 1.00488544, + "epoch": 0.7231407443034931, + "flos": 28548300908160.0, + "grad_norm": 1.832451128489561, + "language_loss": 0.82449818, + "learning_rate": 7.514549588009798e-07, + "loss": 0.84660602, + "num_input_tokens_seen": 129294785, + "step": 6014, + "time_per_iteration": 3.6902263164520264 + }, + { + "auxiliary_loss_clip": 0.01115828, + "auxiliary_loss_mlp": 0.01085294, + "balance_loss_clip": 1.02387524, + "balance_loss_mlp": 1.00508952, + "epoch": 0.7232609871941321, + "flos": 30008536508160.0, + "grad_norm": 2.237553051983731, + "language_loss": 0.70314264, + "learning_rate": 7.508465156919492e-07, + "loss": 0.72515392, + "num_input_tokens_seen": 129318295, + "step": 6015, + "time_per_iteration": 3.788170576095581 + }, + { + "auxiliary_loss_clip": 0.01116813, + "auxiliary_loss_mlp": 0.01084981, + "balance_loss_clip": 1.0244683, + "balance_loss_mlp": 1.00472939, + "epoch": 0.7233812300847713, + "flos": 16654005031680.0, + "grad_norm": 5.513954708527117, + "language_loss": 0.61275512, + "learning_rate": 7.502382620785083e-07, + "loss": 0.63477302, + "num_input_tokens_seen": 129334845, + "step": 6016, + "time_per_iteration": 3.627199172973633 + }, + { + "auxiliary_loss_clip": 0.01086804, + "auxiliary_loss_mlp": 0.01079049, + "balance_loss_clip": 1.02499104, + "balance_loss_mlp": 1.00008488, + "epoch": 0.7235014729754103, + "flos": 67258784050560.0, + "grad_norm": 0.8038399759989554, + "language_loss": 0.62568092, + "learning_rate": 7.496301980529289e-07, + "loss": 0.64733946, + "num_input_tokens_seen": 129398055, + "step": 6017, + "time_per_iteration": 3.3661937713623047 + }, + { + "auxiliary_loss_clip": 0.01135855, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_clip": 1.02589965, + "balance_loss_mlp": 1.00471187, + "epoch": 0.7236217158660494, + "flos": 26943237671040.0, + "grad_norm": 3.108781521129674, + "language_loss": 0.7457751, + "learning_rate": 7.490223237074547e-07, + "loss": 0.76798278, + "num_input_tokens_seen": 129417765, + "step": 6018, + "time_per_iteration": 3.5664632320404053 + }, + { + "auxiliary_loss_clip": 0.01107334, + "auxiliary_loss_mlp": 0.01084282, + "balance_loss_clip": 1.02246666, + "balance_loss_mlp": 1.00417387, + "epoch": 0.7237419587566886, + "flos": 29423372042880.0, + "grad_norm": 1.8374737738804834, + "language_loss": 0.65987551, + "learning_rate": 7.484146391342989e-07, + "loss": 0.68179166, + "num_input_tokens_seen": 129437560, + "step": 6019, + "time_per_iteration": 2.802307367324829 + }, + { + "auxiliary_loss_clip": 0.01117419, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_clip": 1.02427077, + "balance_loss_mlp": 1.00321651, + "epoch": 0.7238622016473276, + "flos": 17821496787840.0, + "grad_norm": 2.324836903174501, + "language_loss": 0.56876975, + "learning_rate": 7.478071444256484e-07, + "loss": 0.59077811, + "num_input_tokens_seen": 129455320, + "step": 6020, + "time_per_iteration": 2.6808981895446777 + }, + { + "auxiliary_loss_clip": 0.01093816, + "auxiliary_loss_mlp": 0.01084589, + "balance_loss_clip": 1.02483869, + "balance_loss_mlp": 1.0044328, + "epoch": 0.7239824445379667, + "flos": 25739117020800.0, + "grad_norm": 1.865457694111468, + "language_loss": 0.79347765, + "learning_rate": 7.471998396736579e-07, + "loss": 0.81526172, + "num_input_tokens_seen": 129475700, + "step": 6021, + "time_per_iteration": 2.7669026851654053 + }, + { + "auxiliary_loss_clip": 0.01106044, + "auxiliary_loss_mlp": 0.0108398, + "balance_loss_clip": 1.02341521, + "balance_loss_mlp": 1.00377584, + "epoch": 0.7241026874286057, + "flos": 23148916398720.0, + "grad_norm": 1.6342298882357356, + "language_loss": 0.75722063, + "learning_rate": 7.465927249704549e-07, + "loss": 0.77912086, + "num_input_tokens_seen": 129493585, + "step": 6022, + "time_per_iteration": 2.744976758956909 + }, + { + "auxiliary_loss_clip": 0.01124357, + "auxiliary_loss_mlp": 0.01084099, + "balance_loss_clip": 1.02388787, + "balance_loss_mlp": 1.00399041, + "epoch": 0.7242229303192449, + "flos": 20266905686400.0, + "grad_norm": 1.8710640927256377, + "language_loss": 0.7723158, + "learning_rate": 7.459858004081398e-07, + "loss": 0.79440033, + "num_input_tokens_seen": 129511555, + "step": 6023, + "time_per_iteration": 2.696571111679077 + }, + { + "auxiliary_loss_clip": 0.01090622, + "auxiliary_loss_mlp": 0.01079058, + "balance_loss_clip": 1.02160358, + "balance_loss_mlp": 1.00009334, + "epoch": 0.724343173209884, + "flos": 62311659684480.0, + "grad_norm": 0.6799686532222688, + "language_loss": 0.58007455, + "learning_rate": 7.453790660787815e-07, + "loss": 0.60177135, + "num_input_tokens_seen": 129579650, + "step": 6024, + "time_per_iteration": 3.4176342487335205 + }, + { + "auxiliary_loss_clip": 0.0111523, + "auxiliary_loss_mlp": 0.01085025, + "balance_loss_clip": 1.02410531, + "balance_loss_mlp": 1.00472581, + "epoch": 0.724463416100523, + "flos": 35006403813120.0, + "grad_norm": 2.319934527093736, + "language_loss": 0.63530135, + "learning_rate": 7.447725220744214e-07, + "loss": 0.65730393, + "num_input_tokens_seen": 129601895, + "step": 6025, + "time_per_iteration": 2.8613967895507812 + }, + { + "auxiliary_loss_clip": 0.0113548, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.02576184, + "balance_loss_mlp": 1.00459993, + "epoch": 0.7245836589911622, + "flos": 21871968923520.0, + "grad_norm": 2.153293997152825, + "language_loss": 0.77252746, + "learning_rate": 7.441661684870717e-07, + "loss": 0.79472977, + "num_input_tokens_seen": 129622150, + "step": 6026, + "time_per_iteration": 2.6719632148742676 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.010841, + "balance_loss_clip": 1.02475119, + "balance_loss_mlp": 1.00394344, + "epoch": 0.7247039018818012, + "flos": 23006494972800.0, + "grad_norm": 1.6556282322859068, + "language_loss": 0.8168326, + "learning_rate": 7.435600054087152e-07, + "loss": 0.83901381, + "num_input_tokens_seen": 129644315, + "step": 6027, + "time_per_iteration": 2.6817452907562256 + }, + { + "auxiliary_loss_clip": 0.01135517, + "auxiliary_loss_mlp": 0.01083823, + "balance_loss_clip": 1.02617896, + "balance_loss_mlp": 1.00352383, + "epoch": 0.7248241447724403, + "flos": 31722588587520.0, + "grad_norm": 1.838978202164283, + "language_loss": 0.74392533, + "learning_rate": 7.42954032931308e-07, + "loss": 0.76611871, + "num_input_tokens_seen": 129665355, + "step": 6028, + "time_per_iteration": 2.681993007659912 + }, + { + "auxiliary_loss_clip": 0.01114656, + "auxiliary_loss_mlp": 0.01085005, + "balance_loss_clip": 1.02337384, + "balance_loss_mlp": 1.00489628, + "epoch": 0.7249443876630794, + "flos": 34896984007680.0, + "grad_norm": 1.7733095237223682, + "language_loss": 0.74387527, + "learning_rate": 7.423482511467733e-07, + "loss": 0.76587188, + "num_input_tokens_seen": 129686125, + "step": 6029, + "time_per_iteration": 2.8228647708892822 + }, + { + "auxiliary_loss_clip": 0.01082394, + "auxiliary_loss_mlp": 0.01086031, + "balance_loss_clip": 1.0221945, + "balance_loss_mlp": 1.00577903, + "epoch": 0.7250646305537185, + "flos": 26359294268160.0, + "grad_norm": 1.9377823653428214, + "language_loss": 0.64646661, + "learning_rate": 7.417426601470099e-07, + "loss": 0.6681509, + "num_input_tokens_seen": 129706485, + "step": 6030, + "time_per_iteration": 2.8691959381103516 + }, + { + "auxiliary_loss_clip": 0.01125278, + "auxiliary_loss_mlp": 0.01083352, + "balance_loss_clip": 1.02453852, + "balance_loss_mlp": 1.00300515, + "epoch": 0.7251848734443576, + "flos": 30081614728320.0, + "grad_norm": 2.0880070744394197, + "language_loss": 0.7877084, + "learning_rate": 7.411372600238841e-07, + "loss": 0.80979466, + "num_input_tokens_seen": 129727100, + "step": 6031, + "time_per_iteration": 2.710779905319214 + }, + { + "auxiliary_loss_clip": 0.01135484, + "auxiliary_loss_mlp": 0.01084333, + "balance_loss_clip": 1.02567279, + "balance_loss_mlp": 1.00417662, + "epoch": 0.7253051163349967, + "flos": 17785262943360.0, + "grad_norm": 2.0055751573616702, + "language_loss": 0.73565912, + "learning_rate": 7.405320508692346e-07, + "loss": 0.75785726, + "num_input_tokens_seen": 129745840, + "step": 6032, + "time_per_iteration": 2.6448090076446533 + }, + { + "auxiliary_loss_clip": 0.01133749, + "auxiliary_loss_mlp": 0.0108357, + "balance_loss_clip": 1.02515161, + "balance_loss_mlp": 1.00355673, + "epoch": 0.7254253592256358, + "flos": 12641346938880.0, + "grad_norm": 1.9011422574731363, + "language_loss": 0.75335443, + "learning_rate": 7.399270327748727e-07, + "loss": 0.7755276, + "num_input_tokens_seen": 129763500, + "step": 6033, + "time_per_iteration": 2.640808582305908 + }, + { + "auxiliary_loss_clip": 0.01108068, + "auxiliary_loss_mlp": 0.00872785, + "balance_loss_clip": 1.02371407, + "balance_loss_mlp": 1.00016701, + "epoch": 0.7255456021162748, + "flos": 27199208966400.0, + "grad_norm": 6.427447863356703, + "language_loss": 0.74189228, + "learning_rate": 7.39322205832577e-07, + "loss": 0.76170075, + "num_input_tokens_seen": 129784390, + "step": 6034, + "time_per_iteration": 2.882194995880127 + }, + { + "auxiliary_loss_clip": 0.01117149, + "auxiliary_loss_mlp": 0.01084264, + "balance_loss_clip": 1.02495027, + "balance_loss_mlp": 1.00406003, + "epoch": 0.725665845006914, + "flos": 21288205088640.0, + "grad_norm": 1.738287419884083, + "language_loss": 0.80845189, + "learning_rate": 7.387175701341009e-07, + "loss": 0.83046603, + "num_input_tokens_seen": 129803060, + "step": 6035, + "time_per_iteration": 2.6679954528808594 + }, + { + "auxiliary_loss_clip": 0.01124107, + "auxiliary_loss_mlp": 0.01083749, + "balance_loss_clip": 1.02403808, + "balance_loss_mlp": 1.00359297, + "epoch": 0.7257860878975531, + "flos": 16033684129920.0, + "grad_norm": 2.4332962269421246, + "language_loss": 0.72103012, + "learning_rate": 7.381131257711659e-07, + "loss": 0.74310875, + "num_input_tokens_seen": 129820165, + "step": 6036, + "time_per_iteration": 2.6662418842315674 + }, + { + "auxiliary_loss_clip": 0.01110402, + "auxiliary_loss_mlp": 0.01084033, + "balance_loss_clip": 1.02129221, + "balance_loss_mlp": 1.0040195, + "epoch": 0.7259063307881921, + "flos": 12129943052160.0, + "grad_norm": 1.775836335313676, + "language_loss": 0.83347046, + "learning_rate": 7.375088728354677e-07, + "loss": 0.85541487, + "num_input_tokens_seen": 129835195, + "step": 6037, + "time_per_iteration": 2.7134737968444824 + }, + { + "auxiliary_loss_clip": 0.01106139, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_clip": 1.02230382, + "balance_loss_mlp": 1.00309575, + "epoch": 0.7260265736788313, + "flos": 30443845432320.0, + "grad_norm": 2.046260754072488, + "language_loss": 0.6724537, + "learning_rate": 7.369048114186691e-07, + "loss": 0.69434714, + "num_input_tokens_seen": 129856240, + "step": 6038, + "time_per_iteration": 3.8234968185424805 + }, + { + "auxiliary_loss_clip": 0.01094407, + "auxiliary_loss_mlp": 0.00872682, + "balance_loss_clip": 1.02596903, + "balance_loss_mlp": 1.00012779, + "epoch": 0.7261468165694703, + "flos": 21142264129920.0, + "grad_norm": 1.9126907129790778, + "language_loss": 0.83221942, + "learning_rate": 7.363009416124055e-07, + "loss": 0.85189033, + "num_input_tokens_seen": 129875565, + "step": 6039, + "time_per_iteration": 2.8039698600769043 + }, + { + "auxiliary_loss_clip": 0.01107899, + "auxiliary_loss_mlp": 0.01083585, + "balance_loss_clip": 1.0245223, + "balance_loss_mlp": 1.00338101, + "epoch": 0.7262670594601094, + "flos": 22306308180480.0, + "grad_norm": 2.4197873261670337, + "language_loss": 0.63433206, + "learning_rate": 7.356972635082852e-07, + "loss": 0.65624696, + "num_input_tokens_seen": 129894420, + "step": 6040, + "time_per_iteration": 2.7551746368408203 + }, + { + "auxiliary_loss_clip": 0.01088489, + "auxiliary_loss_mlp": 0.01085368, + "balance_loss_clip": 1.01789057, + "balance_loss_mlp": 1.00525963, + "epoch": 0.7263873023507486, + "flos": 25335049950720.0, + "grad_norm": 1.7072566995297611, + "language_loss": 0.75313032, + "learning_rate": 7.35093777197884e-07, + "loss": 0.77486885, + "num_input_tokens_seen": 129914490, + "step": 6041, + "time_per_iteration": 4.822787761688232 + }, + { + "auxiliary_loss_clip": 0.01116344, + "auxiliary_loss_mlp": 0.01082991, + "balance_loss_clip": 1.02452803, + "balance_loss_mlp": 1.00283468, + "epoch": 0.7265075452413876, + "flos": 23878621192320.0, + "grad_norm": 2.092107507826142, + "language_loss": 0.85092813, + "learning_rate": 7.344904827727525e-07, + "loss": 0.87292153, + "num_input_tokens_seen": 129931670, + "step": 6042, + "time_per_iteration": 2.778331995010376 + }, + { + "auxiliary_loss_clip": 0.01109435, + "auxiliary_loss_mlp": 0.01084323, + "balance_loss_clip": 1.0241946, + "balance_loss_mlp": 1.00416708, + "epoch": 0.7266277881320267, + "flos": 28724549967360.0, + "grad_norm": 5.158523822883245, + "language_loss": 0.734694, + "learning_rate": 7.338873803244076e-07, + "loss": 0.75663161, + "num_input_tokens_seen": 129946905, + "step": 6043, + "time_per_iteration": 3.765888214111328 + }, + { + "auxiliary_loss_clip": 0.01113084, + "auxiliary_loss_mlp": 0.01083966, + "balance_loss_clip": 1.02228808, + "balance_loss_mlp": 1.00380981, + "epoch": 0.7267480310226658, + "flos": 24863507182080.0, + "grad_norm": 1.6412462436759256, + "language_loss": 0.80803323, + "learning_rate": 7.332844699443401e-07, + "loss": 0.83000374, + "num_input_tokens_seen": 129965505, + "step": 6044, + "time_per_iteration": 2.7664177417755127 + }, + { + "auxiliary_loss_clip": 0.01098536, + "auxiliary_loss_mlp": 0.01085227, + "balance_loss_clip": 1.02355552, + "balance_loss_mlp": 1.00511813, + "epoch": 0.7268682739133049, + "flos": 27198490694400.0, + "grad_norm": 1.6927480255297152, + "language_loss": 0.75264549, + "learning_rate": 7.326817517240121e-07, + "loss": 0.77448314, + "num_input_tokens_seen": 129987210, + "step": 6045, + "time_per_iteration": 2.8559508323669434 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.00872829, + "balance_loss_clip": 1.02453589, + "balance_loss_mlp": 1.00011945, + "epoch": 0.7269885168039439, + "flos": 33508138688640.0, + "grad_norm": 1.7204848698848558, + "language_loss": 0.83387804, + "learning_rate": 7.320792257548545e-07, + "loss": 0.8538636, + "num_input_tokens_seen": 130008385, + "step": 6046, + "time_per_iteration": 2.7518105506896973 + }, + { + "auxiliary_loss_clip": 0.01117197, + "auxiliary_loss_mlp": 0.0108383, + "balance_loss_clip": 1.02462482, + "balance_loss_mlp": 1.0035305, + "epoch": 0.7271087596945831, + "flos": 24313750548480.0, + "grad_norm": 1.9965038447892407, + "language_loss": 0.76148778, + "learning_rate": 7.314768921282704e-07, + "loss": 0.78349805, + "num_input_tokens_seen": 130029040, + "step": 6047, + "time_per_iteration": 2.805934429168701 + }, + { + "auxiliary_loss_clip": 0.01126729, + "auxiliary_loss_mlp": 0.01084256, + "balance_loss_clip": 1.02539027, + "balance_loss_mlp": 1.00409997, + "epoch": 0.7272290025852222, + "flos": 23805147922560.0, + "grad_norm": 2.8084822182878497, + "language_loss": 0.72852689, + "learning_rate": 7.30874750935633e-07, + "loss": 0.7506367, + "num_input_tokens_seen": 130048725, + "step": 6048, + "time_per_iteration": 2.6762404441833496 + }, + { + "auxiliary_loss_clip": 0.0110682, + "auxiliary_loss_mlp": 0.01084471, + "balance_loss_clip": 1.0236094, + "balance_loss_mlp": 1.00431418, + "epoch": 0.7273492454758612, + "flos": 16720367408640.0, + "grad_norm": 1.7887638507795216, + "language_loss": 0.79015267, + "learning_rate": 7.30272802268286e-07, + "loss": 0.81206554, + "num_input_tokens_seen": 130065720, + "step": 6049, + "time_per_iteration": 2.7919464111328125 + }, + { + "auxiliary_loss_clip": 0.01080713, + "auxiliary_loss_mlp": 0.01084004, + "balance_loss_clip": 1.02154851, + "balance_loss_mlp": 1.0039432, + "epoch": 0.7274694883665004, + "flos": 28031330413440.0, + "grad_norm": 1.6631148786376029, + "language_loss": 0.76189601, + "learning_rate": 7.29671046217547e-07, + "loss": 0.78354323, + "num_input_tokens_seen": 130084830, + "step": 6050, + "time_per_iteration": 2.8620645999908447 + }, + { + "auxiliary_loss_clip": 0.01107418, + "auxiliary_loss_mlp": 0.01084545, + "balance_loss_clip": 1.02320826, + "balance_loss_mlp": 1.00448358, + "epoch": 0.7275897312571394, + "flos": 30372706546560.0, + "grad_norm": 1.9642868191980658, + "language_loss": 0.81506741, + "learning_rate": 7.290694828746988e-07, + "loss": 0.83698714, + "num_input_tokens_seen": 130104495, + "step": 6051, + "time_per_iteration": 2.8511087894439697 + }, + { + "auxiliary_loss_clip": 0.01108596, + "auxiliary_loss_mlp": 0.01083637, + "balance_loss_clip": 1.02413797, + "balance_loss_mlp": 1.00352859, + "epoch": 0.7277099741477785, + "flos": 19204775498880.0, + "grad_norm": 1.7407639740763112, + "language_loss": 0.8588649, + "learning_rate": 7.284681123310004e-07, + "loss": 0.88078731, + "num_input_tokens_seen": 130123210, + "step": 6052, + "time_per_iteration": 2.765976667404175 + }, + { + "auxiliary_loss_clip": 0.01118839, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_clip": 1.02415943, + "balance_loss_mlp": 1.00383472, + "epoch": 0.7278302170384175, + "flos": 20667884186880.0, + "grad_norm": 1.623787820594287, + "language_loss": 0.79735553, + "learning_rate": 7.27866934677678e-07, + "loss": 0.81938434, + "num_input_tokens_seen": 130142880, + "step": 6053, + "time_per_iteration": 2.657139778137207 + }, + { + "auxiliary_loss_clip": 0.01089133, + "auxiliary_loss_mlp": 0.01083915, + "balance_loss_clip": 1.02120376, + "balance_loss_mlp": 1.00375855, + "epoch": 0.7279504599290567, + "flos": 19093200877440.0, + "grad_norm": 1.664627757362957, + "language_loss": 0.78476673, + "learning_rate": 7.272659500059297e-07, + "loss": 0.80649722, + "num_input_tokens_seen": 130160220, + "step": 6054, + "time_per_iteration": 2.7869865894317627 + }, + { + "auxiliary_loss_clip": 0.0112752, + "auxiliary_loss_mlp": 0.01085033, + "balance_loss_clip": 1.02602994, + "balance_loss_mlp": 1.00478148, + "epoch": 0.7280707028196958, + "flos": 19062174504960.0, + "grad_norm": 2.7111359851997987, + "language_loss": 0.80201954, + "learning_rate": 7.266651584069264e-07, + "loss": 0.82414508, + "num_input_tokens_seen": 130177885, + "step": 6055, + "time_per_iteration": 2.665783166885376 + }, + { + "auxiliary_loss_clip": 0.01126421, + "auxiliary_loss_mlp": 0.01085555, + "balance_loss_clip": 1.02525949, + "balance_loss_mlp": 1.0054462, + "epoch": 0.7281909457103348, + "flos": 37196308293120.0, + "grad_norm": 1.6499908987026088, + "language_loss": 0.56822956, + "learning_rate": 7.260645599718045e-07, + "loss": 0.59034926, + "num_input_tokens_seen": 130204240, + "step": 6056, + "time_per_iteration": 2.803079605102539 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01084157, + "balance_loss_clip": 1.02499032, + "balance_loss_mlp": 1.00390589, + "epoch": 0.728311188600974, + "flos": 20667094087680.0, + "grad_norm": 2.7357557963382013, + "language_loss": 0.67069304, + "learning_rate": 7.254641547916767e-07, + "loss": 0.69270527, + "num_input_tokens_seen": 130221735, + "step": 6057, + "time_per_iteration": 2.653323173522949 + }, + { + "auxiliary_loss_clip": 0.01137397, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_clip": 1.02824759, + "balance_loss_mlp": 1.0038451, + "epoch": 0.728431431491613, + "flos": 28840685616000.0, + "grad_norm": 1.8385668659129726, + "language_loss": 0.68792647, + "learning_rate": 7.248639429576226e-07, + "loss": 0.71014094, + "num_input_tokens_seen": 130241190, + "step": 6058, + "time_per_iteration": 2.70039963722229 + }, + { + "auxiliary_loss_clip": 0.01126263, + "auxiliary_loss_mlp": 0.01084252, + "balance_loss_clip": 1.02518702, + "balance_loss_mlp": 1.00400066, + "epoch": 0.7285516743822521, + "flos": 25991856092160.0, + "grad_norm": 1.6710704329305899, + "language_loss": 0.72035789, + "learning_rate": 7.242639245606959e-07, + "loss": 0.74246305, + "num_input_tokens_seen": 130260980, + "step": 6059, + "time_per_iteration": 2.7192420959472656 + }, + { + "auxiliary_loss_clip": 0.01119236, + "auxiliary_loss_mlp": 0.01085068, + "balance_loss_clip": 1.0259831, + "balance_loss_mlp": 1.0049119, + "epoch": 0.7286719172728913, + "flos": 16399721675520.0, + "grad_norm": 1.7143585738637468, + "language_loss": 0.82473183, + "learning_rate": 7.236640996919168e-07, + "loss": 0.84677494, + "num_input_tokens_seen": 130280025, + "step": 6060, + "time_per_iteration": 2.7200112342834473 + }, + { + "auxiliary_loss_clip": 0.01126008, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_clip": 1.02492905, + "balance_loss_mlp": 1.00366759, + "epoch": 0.7287921601635303, + "flos": 22018161277440.0, + "grad_norm": 1.5230127033555498, + "language_loss": 0.70483965, + "learning_rate": 7.230644684422782e-07, + "loss": 0.72693747, + "num_input_tokens_seen": 130300255, + "step": 6061, + "time_per_iteration": 2.673022747039795 + }, + { + "auxiliary_loss_clip": 0.01107226, + "auxiliary_loss_mlp": 0.01084617, + "balance_loss_clip": 1.02388, + "balance_loss_mlp": 1.00436497, + "epoch": 0.7289124030541694, + "flos": 24600927784320.0, + "grad_norm": 1.8303167262963354, + "language_loss": 0.81802535, + "learning_rate": 7.224650309027451e-07, + "loss": 0.83994377, + "num_input_tokens_seen": 130320005, + "step": 6062, + "time_per_iteration": 2.795703411102295 + }, + { + "auxiliary_loss_clip": 0.01126454, + "auxiliary_loss_mlp": 0.010845, + "balance_loss_clip": 1.02531707, + "balance_loss_mlp": 1.00443947, + "epoch": 0.7290326459448085, + "flos": 21393638484480.0, + "grad_norm": 1.7633957943717684, + "language_loss": 0.68806213, + "learning_rate": 7.218657871642506e-07, + "loss": 0.71017164, + "num_input_tokens_seen": 130338810, + "step": 6063, + "time_per_iteration": 2.687978982925415 + }, + { + "auxiliary_loss_clip": 0.01136267, + "auxiliary_loss_mlp": 0.01084299, + "balance_loss_clip": 1.02657592, + "balance_loss_mlp": 1.00395179, + "epoch": 0.7291528888354476, + "flos": 18587686821120.0, + "grad_norm": 5.201889254097807, + "language_loss": 0.62202764, + "learning_rate": 7.212667373177012e-07, + "loss": 0.64423335, + "num_input_tokens_seen": 130353805, + "step": 6064, + "time_per_iteration": 3.414767265319824 + }, + { + "auxiliary_loss_clip": 0.01108569, + "auxiliary_loss_mlp": 0.01083957, + "balance_loss_clip": 1.0238359, + "balance_loss_mlp": 1.0038482, + "epoch": 0.7292731317260867, + "flos": 18951066760320.0, + "grad_norm": 3.259228545610044, + "language_loss": 0.75019389, + "learning_rate": 7.206678814539704e-07, + "loss": 0.77211916, + "num_input_tokens_seen": 130372105, + "step": 6065, + "time_per_iteration": 2.7681572437286377 + }, + { + "auxiliary_loss_clip": 0.0109893, + "auxiliary_loss_mlp": 0.01083761, + "balance_loss_clip": 1.02251852, + "balance_loss_mlp": 1.00374818, + "epoch": 0.7293933746167258, + "flos": 21067569797760.0, + "grad_norm": 1.800708786523549, + "language_loss": 0.72654057, + "learning_rate": 7.20069219663904e-07, + "loss": 0.74836749, + "num_input_tokens_seen": 130391990, + "step": 6066, + "time_per_iteration": 3.8233845233917236 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01083649, + "balance_loss_clip": 1.02505291, + "balance_loss_mlp": 1.00344539, + "epoch": 0.7295136175073649, + "flos": 22453326547200.0, + "grad_norm": 1.5993414495725837, + "language_loss": 0.79577315, + "learning_rate": 7.1947075203832e-07, + "loss": 0.8178736, + "num_input_tokens_seen": 130411970, + "step": 6067, + "time_per_iteration": 3.6138243675231934 + }, + { + "auxiliary_loss_clip": 0.01115933, + "auxiliary_loss_mlp": 0.01078876, + "balance_loss_clip": 1.02060139, + "balance_loss_mlp": 0.99991173, + "epoch": 0.7296338603980039, + "flos": 56125506648960.0, + "grad_norm": 0.8571528187661747, + "language_loss": 0.60162836, + "learning_rate": 7.188724786680049e-07, + "loss": 0.62357646, + "num_input_tokens_seen": 130472440, + "step": 6068, + "time_per_iteration": 4.195640563964844 + }, + { + "auxiliary_loss_clip": 0.01113982, + "auxiliary_loss_mlp": 0.01083758, + "balance_loss_clip": 1.0228647, + "balance_loss_mlp": 1.00369716, + "epoch": 0.7297541032886431, + "flos": 25228287751680.0, + "grad_norm": 1.6997784477263556, + "language_loss": 0.75970709, + "learning_rate": 7.182743996437162e-07, + "loss": 0.78168452, + "num_input_tokens_seen": 130491975, + "step": 6069, + "time_per_iteration": 2.767322301864624 + }, + { + "auxiliary_loss_clip": 0.01108662, + "auxiliary_loss_mlp": 0.01084409, + "balance_loss_clip": 1.02467513, + "balance_loss_mlp": 1.00411022, + "epoch": 0.7298743461792822, + "flos": 26467600752000.0, + "grad_norm": 2.0633228981724887, + "language_loss": 0.68447697, + "learning_rate": 7.176765150561819e-07, + "loss": 0.70640767, + "num_input_tokens_seen": 130510580, + "step": 6070, + "time_per_iteration": 2.8142638206481934 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01085153, + "balance_loss_clip": 1.02416921, + "balance_loss_mlp": 1.00490141, + "epoch": 0.7299945890699212, + "flos": 19569053278080.0, + "grad_norm": 4.09182214810377, + "language_loss": 0.7949841, + "learning_rate": 7.170788249961002e-07, + "loss": 0.81717294, + "num_input_tokens_seen": 130529090, + "step": 6071, + "time_per_iteration": 2.595759630203247 + }, + { + "auxiliary_loss_clip": 0.01134839, + "auxiliary_loss_mlp": 0.01082902, + "balance_loss_clip": 1.0257057, + "balance_loss_mlp": 1.00274563, + "epoch": 0.7301148319605604, + "flos": 22928963466240.0, + "grad_norm": 2.5515011157310497, + "language_loss": 0.88343751, + "learning_rate": 7.164813295541418e-07, + "loss": 0.90561485, + "num_input_tokens_seen": 130548655, + "step": 6072, + "time_per_iteration": 2.584062099456787 + }, + { + "auxiliary_loss_clip": 0.01114563, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_clip": 1.02290285, + "balance_loss_mlp": 1.00389373, + "epoch": 0.7302350748511994, + "flos": 25369703596800.0, + "grad_norm": 1.5270379285638223, + "language_loss": 0.70252895, + "learning_rate": 7.15884028820944e-07, + "loss": 0.7245146, + "num_input_tokens_seen": 130567710, + "step": 6073, + "time_per_iteration": 2.742029905319214 + }, + { + "auxiliary_loss_clip": 0.0110747, + "auxiliary_loss_mlp": 0.01083769, + "balance_loss_clip": 1.02339649, + "balance_loss_mlp": 1.0036602, + "epoch": 0.7303553177418385, + "flos": 27819170732160.0, + "grad_norm": 1.956903312538064, + "language_loss": 0.6031518, + "learning_rate": 7.152869228871185e-07, + "loss": 0.62506419, + "num_input_tokens_seen": 130590195, + "step": 6074, + "time_per_iteration": 2.7549798488616943 + }, + { + "auxiliary_loss_clip": 0.01119848, + "auxiliary_loss_mlp": 0.01083929, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 1.00372458, + "epoch": 0.7304755606324776, + "flos": 24426510318720.0, + "grad_norm": 1.6754588992626485, + "language_loss": 0.72146028, + "learning_rate": 7.146900118432457e-07, + "loss": 0.74349803, + "num_input_tokens_seen": 130609940, + "step": 6075, + "time_per_iteration": 2.7250378131866455 + }, + { + "auxiliary_loss_clip": 0.01080105, + "auxiliary_loss_mlp": 0.01082973, + "balance_loss_clip": 1.02090693, + "balance_loss_mlp": 1.00286484, + "epoch": 0.7305958035231167, + "flos": 23840483927040.0, + "grad_norm": 8.936300551282475, + "language_loss": 0.85823321, + "learning_rate": 7.140932957798753e-07, + "loss": 0.87986398, + "num_input_tokens_seen": 130628380, + "step": 6076, + "time_per_iteration": 2.8966050148010254 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.0108509, + "balance_loss_clip": 1.02588654, + "balance_loss_mlp": 1.00483775, + "epoch": 0.7307160464137558, + "flos": 16726939597440.0, + "grad_norm": 1.9782527896788993, + "language_loss": 0.71215719, + "learning_rate": 7.134967747875309e-07, + "loss": 0.73420501, + "num_input_tokens_seen": 130646590, + "step": 6077, + "time_per_iteration": 2.6502773761749268 + }, + { + "auxiliary_loss_clip": 0.01126058, + "auxiliary_loss_mlp": 0.01084812, + "balance_loss_clip": 1.02447379, + "balance_loss_mlp": 1.00456035, + "epoch": 0.7308362893043949, + "flos": 21798280172160.0, + "grad_norm": 2.172622608238896, + "language_loss": 0.81724107, + "learning_rate": 7.129004489567014e-07, + "loss": 0.83934975, + "num_input_tokens_seen": 130664070, + "step": 6078, + "time_per_iteration": 2.685554265975952 + }, + { + "auxiliary_loss_clip": 0.01106507, + "auxiliary_loss_mlp": 0.01085175, + "balance_loss_clip": 1.02295566, + "balance_loss_mlp": 1.00501895, + "epoch": 0.730956532195034, + "flos": 10707377840640.0, + "grad_norm": 2.42283484498353, + "language_loss": 0.77634776, + "learning_rate": 7.123043183778512e-07, + "loss": 0.79826456, + "num_input_tokens_seen": 130681400, + "step": 6079, + "time_per_iteration": 2.709778070449829 + }, + { + "auxiliary_loss_clip": 0.01107214, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_clip": 1.02333665, + "balance_loss_mlp": 1.00573194, + "epoch": 0.731076775085673, + "flos": 19791987039360.0, + "grad_norm": 1.7133886525732607, + "language_loss": 0.65172035, + "learning_rate": 7.117083831414114e-07, + "loss": 0.67365086, + "num_input_tokens_seen": 130700675, + "step": 6080, + "time_per_iteration": 2.729588031768799 + }, + { + "auxiliary_loss_clip": 0.01134268, + "auxiliary_loss_mlp": 0.01083231, + "balance_loss_clip": 1.02504373, + "balance_loss_mlp": 1.00307488, + "epoch": 0.7311970179763122, + "flos": 20447033414400.0, + "grad_norm": 2.024795205555107, + "language_loss": 0.69676399, + "learning_rate": 7.11112643337787e-07, + "loss": 0.71893895, + "num_input_tokens_seen": 130719720, + "step": 6081, + "time_per_iteration": 2.65775465965271 + }, + { + "auxiliary_loss_clip": 0.0111239, + "auxiliary_loss_mlp": 0.01084538, + "balance_loss_clip": 1.02173042, + "balance_loss_mlp": 1.00419104, + "epoch": 0.7313172608669513, + "flos": 18513818501760.0, + "grad_norm": 2.1513720428642316, + "language_loss": 0.76304483, + "learning_rate": 7.10517099057349e-07, + "loss": 0.78501409, + "num_input_tokens_seen": 130736670, + "step": 6082, + "time_per_iteration": 2.7326183319091797 + }, + { + "auxiliary_loss_clip": 0.01110516, + "auxiliary_loss_mlp": 0.01085176, + "balance_loss_clip": 1.02419686, + "balance_loss_mlp": 1.00492442, + "epoch": 0.7314375037575903, + "flos": 16180738410240.0, + "grad_norm": 2.2627547382970836, + "language_loss": 0.61402178, + "learning_rate": 7.099217503904411e-07, + "loss": 0.6359787, + "num_input_tokens_seen": 130754525, + "step": 6083, + "time_per_iteration": 2.6735925674438477 + }, + { + "auxiliary_loss_clip": 0.0111666, + "auxiliary_loss_mlp": 0.01083965, + "balance_loss_clip": 1.02404618, + "balance_loss_mlp": 1.00380898, + "epoch": 0.7315577466482295, + "flos": 17967940536960.0, + "grad_norm": 1.828494140615221, + "language_loss": 0.89892638, + "learning_rate": 7.093265974273788e-07, + "loss": 0.92093259, + "num_input_tokens_seen": 130772420, + "step": 6084, + "time_per_iteration": 2.7476048469543457 + }, + { + "auxiliary_loss_clip": 0.01125558, + "auxiliary_loss_mlp": 0.01083915, + "balance_loss_clip": 1.02372217, + "balance_loss_mlp": 1.00385356, + "epoch": 0.7316779895388685, + "flos": 18405440190720.0, + "grad_norm": 1.8650604838496745, + "language_loss": 0.72113425, + "learning_rate": 7.087316402584447e-07, + "loss": 0.74322903, + "num_input_tokens_seen": 130791245, + "step": 6085, + "time_per_iteration": 2.6261818408966064 + }, + { + "auxiliary_loss_clip": 0.01133936, + "auxiliary_loss_mlp": 0.01084769, + "balance_loss_clip": 1.024629, + "balance_loss_mlp": 1.00451756, + "epoch": 0.7317982324295076, + "flos": 17928294900480.0, + "grad_norm": 2.0543517111974823, + "language_loss": 0.86114848, + "learning_rate": 7.081368789738953e-07, + "loss": 0.88333559, + "num_input_tokens_seen": 130808445, + "step": 6086, + "time_per_iteration": 2.6426634788513184 + }, + { + "auxiliary_loss_clip": 0.01117822, + "auxiliary_loss_mlp": 0.01084855, + "balance_loss_clip": 1.02472591, + "balance_loss_mlp": 1.00465071, + "epoch": 0.7319184753201466, + "flos": 27229840289280.0, + "grad_norm": 2.243086743538158, + "language_loss": 0.77936298, + "learning_rate": 7.075423136639537e-07, + "loss": 0.80138969, + "num_input_tokens_seen": 130827700, + "step": 6087, + "time_per_iteration": 2.7876689434051514 + }, + { + "auxiliary_loss_clip": 0.0110131, + "auxiliary_loss_mlp": 0.01083995, + "balance_loss_clip": 1.02343464, + "balance_loss_mlp": 1.00360084, + "epoch": 0.7320387182107858, + "flos": 37448544574080.0, + "grad_norm": 1.7356288516650027, + "language_loss": 0.75032127, + "learning_rate": 7.069479444188149e-07, + "loss": 0.7721743, + "num_input_tokens_seen": 130848290, + "step": 6088, + "time_per_iteration": 2.8614485263824463 + }, + { + "auxiliary_loss_clip": 0.01116813, + "auxiliary_loss_mlp": 0.01084956, + "balance_loss_clip": 1.0247097, + "balance_loss_mlp": 1.004704, + "epoch": 0.7321589611014249, + "flos": 17859023521920.0, + "grad_norm": 1.6666646033193444, + "language_loss": 0.81896919, + "learning_rate": 7.063537713286453e-07, + "loss": 0.84098691, + "num_input_tokens_seen": 130865970, + "step": 6089, + "time_per_iteration": 3.64089035987854 + }, + { + "auxiliary_loss_clip": 0.01117284, + "auxiliary_loss_mlp": 0.01084252, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.00409627, + "epoch": 0.7322792039920639, + "flos": 26100593539200.0, + "grad_norm": 1.9944911528222886, + "language_loss": 0.80616021, + "learning_rate": 7.057597944835803e-07, + "loss": 0.8281756, + "num_input_tokens_seen": 130885245, + "step": 6090, + "time_per_iteration": 2.7575409412384033 + }, + { + "auxiliary_loss_clip": 0.0110975, + "auxiliary_loss_mlp": 0.01084019, + "balance_loss_clip": 1.02467251, + "balance_loss_mlp": 1.00400591, + "epoch": 0.7323994468827031, + "flos": 25369093065600.0, + "grad_norm": 1.6600056674321522, + "language_loss": 0.74618429, + "learning_rate": 7.051660139737253e-07, + "loss": 0.76812196, + "num_input_tokens_seen": 130903465, + "step": 6091, + "time_per_iteration": 3.76153302192688 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.00872941, + "balance_loss_clip": 1.02521515, + "balance_loss_mlp": 1.00008774, + "epoch": 0.7325196897733421, + "flos": 26907075653760.0, + "grad_norm": 1.6900220079562445, + "language_loss": 0.76612532, + "learning_rate": 7.045724298891565e-07, + "loss": 0.78605366, + "num_input_tokens_seen": 130922935, + "step": 6092, + "time_per_iteration": 2.7354068756103516 + }, + { + "auxiliary_loss_clip": 0.01125401, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_clip": 1.02521729, + "balance_loss_mlp": 1.0038451, + "epoch": 0.7326399326639812, + "flos": 25775781828480.0, + "grad_norm": 2.1437581111398774, + "language_loss": 0.69139659, + "learning_rate": 7.039790423199192e-07, + "loss": 0.71349108, + "num_input_tokens_seen": 130942575, + "step": 6093, + "time_per_iteration": 3.603933572769165 + }, + { + "auxiliary_loss_clip": 0.01116582, + "auxiliary_loss_mlp": 0.01085386, + "balance_loss_clip": 1.02440155, + "balance_loss_mlp": 1.00513387, + "epoch": 0.7327601755546204, + "flos": 21032269706880.0, + "grad_norm": 2.2352085841875486, + "language_loss": 0.78255385, + "learning_rate": 7.033858513560322e-07, + "loss": 0.80457348, + "num_input_tokens_seen": 130958870, + "step": 6094, + "time_per_iteration": 3.6897096633911133 + }, + { + "auxiliary_loss_clip": 0.0112731, + "auxiliary_loss_mlp": 0.01084297, + "balance_loss_clip": 1.0267086, + "balance_loss_mlp": 1.00423563, + "epoch": 0.7328804184452594, + "flos": 16289224462080.0, + "grad_norm": 2.1698189798318075, + "language_loss": 0.76069671, + "learning_rate": 7.027928570874794e-07, + "loss": 0.78281283, + "num_input_tokens_seen": 130977060, + "step": 6095, + "time_per_iteration": 2.6765241622924805 + }, + { + "auxiliary_loss_clip": 0.01134027, + "auxiliary_loss_mlp": 0.01083325, + "balance_loss_clip": 1.02462983, + "balance_loss_mlp": 1.00316906, + "epoch": 0.7330006613358985, + "flos": 17858233422720.0, + "grad_norm": 1.936673959098181, + "language_loss": 0.85172409, + "learning_rate": 7.022000596042194e-07, + "loss": 0.87389767, + "num_input_tokens_seen": 130994160, + "step": 6096, + "time_per_iteration": 2.6265599727630615 + }, + { + "auxiliary_loss_clip": 0.01108263, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.02398956, + "balance_loss_mlp": 1.00389051, + "epoch": 0.7331209042265376, + "flos": 22492074343680.0, + "grad_norm": 2.191617132703474, + "language_loss": 0.82053506, + "learning_rate": 7.016074589961784e-07, + "loss": 0.84245813, + "num_input_tokens_seen": 131012725, + "step": 6097, + "time_per_iteration": 2.7732748985290527 + }, + { + "auxiliary_loss_clip": 0.01111108, + "auxiliary_loss_mlp": 0.01084631, + "balance_loss_clip": 1.0211041, + "balance_loss_mlp": 1.00447512, + "epoch": 0.7332411471171767, + "flos": 33072757937280.0, + "grad_norm": 1.5829280872717402, + "language_loss": 0.66907561, + "learning_rate": 7.01015055353253e-07, + "loss": 0.69103301, + "num_input_tokens_seen": 131035150, + "step": 6098, + "time_per_iteration": 2.7986104488372803 + }, + { + "auxiliary_loss_clip": 0.0109493, + "auxiliary_loss_mlp": 0.01085044, + "balance_loss_clip": 1.02174723, + "balance_loss_mlp": 1.00479245, + "epoch": 0.7333613900078157, + "flos": 22743017735040.0, + "grad_norm": 1.6809695039859196, + "language_loss": 0.77923656, + "learning_rate": 7.004228487653123e-07, + "loss": 0.80103636, + "num_input_tokens_seen": 131055955, + "step": 6099, + "time_per_iteration": 2.8841910362243652 + }, + { + "auxiliary_loss_clip": 0.01109573, + "auxiliary_loss_mlp": 0.01084359, + "balance_loss_clip": 1.02368331, + "balance_loss_mlp": 1.0041548, + "epoch": 0.7334816328984549, + "flos": 22346133384960.0, + "grad_norm": 2.054918622472858, + "language_loss": 0.78311056, + "learning_rate": 6.998308393221906e-07, + "loss": 0.8050499, + "num_input_tokens_seen": 131074360, + "step": 6100, + "time_per_iteration": 2.6713805198669434 + }, + { + "auxiliary_loss_clip": 0.01108643, + "auxiliary_loss_mlp": 0.01083576, + "balance_loss_clip": 1.02492726, + "balance_loss_mlp": 1.00351465, + "epoch": 0.733601875789094, + "flos": 20736149984640.0, + "grad_norm": 2.2121990771544358, + "language_loss": 0.70580459, + "learning_rate": 6.992390271136977e-07, + "loss": 0.72772682, + "num_input_tokens_seen": 131090070, + "step": 6101, + "time_per_iteration": 2.772271156311035 + }, + { + "auxiliary_loss_clip": 0.01126708, + "auxiliary_loss_mlp": 0.01084067, + "balance_loss_clip": 1.02560687, + "balance_loss_mlp": 1.00395799, + "epoch": 0.733722118679733, + "flos": 22564362464640.0, + "grad_norm": 1.7812971567598666, + "language_loss": 0.85577703, + "learning_rate": 6.986474122296094e-07, + "loss": 0.87788475, + "num_input_tokens_seen": 131109185, + "step": 6102, + "time_per_iteration": 2.662278652191162 + }, + { + "auxiliary_loss_clip": 0.01137125, + "auxiliary_loss_mlp": 0.01084238, + "balance_loss_clip": 1.0275898, + "balance_loss_mlp": 1.00393903, + "epoch": 0.7338423615703722, + "flos": 20084192179200.0, + "grad_norm": 1.745120847557619, + "language_loss": 0.72864413, + "learning_rate": 6.980559947596751e-07, + "loss": 0.75085777, + "num_input_tokens_seen": 131127725, + "step": 6103, + "time_per_iteration": 2.690917491912842 + }, + { + "auxiliary_loss_clip": 0.01098901, + "auxiliary_loss_mlp": 0.01084739, + "balance_loss_clip": 1.02402747, + "balance_loss_mlp": 1.00453544, + "epoch": 0.7339626044610112, + "flos": 21687675217920.0, + "grad_norm": 4.292299390457168, + "language_loss": 0.75893056, + "learning_rate": 6.974647747936109e-07, + "loss": 0.78076696, + "num_input_tokens_seen": 131146110, + "step": 6104, + "time_per_iteration": 2.8113856315612793 + }, + { + "auxiliary_loss_clip": 0.01134767, + "auxiliary_loss_mlp": 0.00872926, + "balance_loss_clip": 1.02521682, + "balance_loss_mlp": 1.00011849, + "epoch": 0.7340828473516503, + "flos": 15268248282240.0, + "grad_norm": 2.037566959619273, + "language_loss": 0.82206434, + "learning_rate": 6.968737524211039e-07, + "loss": 0.84214127, + "num_input_tokens_seen": 131162920, + "step": 6105, + "time_per_iteration": 2.5920188426971436 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01083705, + "balance_loss_clip": 1.02550077, + "balance_loss_mlp": 1.0034529, + "epoch": 0.7342030902422895, + "flos": 22930112701440.0, + "grad_norm": 2.018519905860827, + "language_loss": 0.79835987, + "learning_rate": 6.962829277318132e-07, + "loss": 0.82045716, + "num_input_tokens_seen": 131182515, + "step": 6106, + "time_per_iteration": 2.6645560264587402 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.0108293, + "balance_loss_clip": 1.02581978, + "balance_loss_mlp": 1.00282168, + "epoch": 0.7343233331329285, + "flos": 25847890381440.0, + "grad_norm": 3.583918402939634, + "language_loss": 0.83283848, + "learning_rate": 6.956923008153652e-07, + "loss": 0.85493201, + "num_input_tokens_seen": 131202280, + "step": 6107, + "time_per_iteration": 2.6696934700012207 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.0108361, + "balance_loss_clip": 1.02537417, + "balance_loss_mlp": 1.00350153, + "epoch": 0.7344435760235676, + "flos": 18478985287680.0, + "grad_norm": 1.9424345294447514, + "language_loss": 0.84220827, + "learning_rate": 6.951018717613593e-07, + "loss": 0.86431551, + "num_input_tokens_seen": 131221295, + "step": 6108, + "time_per_iteration": 2.6815719604492188 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01084612, + "balance_loss_clip": 1.0235486, + "balance_loss_mlp": 1.00450349, + "epoch": 0.7345638189142067, + "flos": 17640040256640.0, + "grad_norm": 1.663130339450311, + "language_loss": 0.77944481, + "learning_rate": 6.945116406593614e-07, + "loss": 0.80152518, + "num_input_tokens_seen": 131240150, + "step": 6109, + "time_per_iteration": 2.6422042846679688 + }, + { + "auxiliary_loss_clip": 0.01090207, + "auxiliary_loss_mlp": 0.01085025, + "balance_loss_clip": 1.02163899, + "balance_loss_mlp": 1.00477362, + "epoch": 0.7346840618048458, + "flos": 20260225756800.0, + "grad_norm": 9.612050166605846, + "language_loss": 0.74553001, + "learning_rate": 6.939216075989089e-07, + "loss": 0.76728237, + "num_input_tokens_seen": 131258080, + "step": 6110, + "time_per_iteration": 2.814406633377075 + }, + { + "auxiliary_loss_clip": 0.01116492, + "auxiliary_loss_mlp": 0.01083647, + "balance_loss_clip": 1.02505875, + "balance_loss_mlp": 1.00344348, + "epoch": 0.7348043046954849, + "flos": 29023183641600.0, + "grad_norm": 1.697815828779639, + "language_loss": 0.66382217, + "learning_rate": 6.933317726695109e-07, + "loss": 0.68582356, + "num_input_tokens_seen": 131279310, + "step": 6111, + "time_per_iteration": 2.7978036403656006 + }, + { + "auxiliary_loss_clip": 0.01100629, + "auxiliary_loss_mlp": 0.01084681, + "balance_loss_clip": 1.01978827, + "balance_loss_mlp": 1.00442898, + "epoch": 0.734924547586124, + "flos": 17931203902080.0, + "grad_norm": 2.4018828281894473, + "language_loss": 0.80015039, + "learning_rate": 6.92742135960644e-07, + "loss": 0.82200348, + "num_input_tokens_seen": 131297010, + "step": 6112, + "time_per_iteration": 2.7357378005981445 + }, + { + "auxiliary_loss_clip": 0.01107189, + "auxiliary_loss_mlp": 0.01079032, + "balance_loss_clip": 1.02000761, + "balance_loss_mlp": 1.00006759, + "epoch": 0.7350447904767631, + "flos": 63588319850880.0, + "grad_norm": 1.0100018712218404, + "language_loss": 0.55671489, + "learning_rate": 6.921526975617556e-07, + "loss": 0.5785771, + "num_input_tokens_seen": 131356470, + "step": 6113, + "time_per_iteration": 3.2884466648101807 + }, + { + "auxiliary_loss_clip": 0.01103101, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.02571273, + "balance_loss_mlp": 1.00434721, + "epoch": 0.7351650333674021, + "flos": 21580015178880.0, + "grad_norm": 1.6539857096954367, + "language_loss": 0.75379229, + "learning_rate": 6.915634575622631e-07, + "loss": 0.77566969, + "num_input_tokens_seen": 131374985, + "step": 6114, + "time_per_iteration": 3.585230827331543 + }, + { + "auxiliary_loss_clip": 0.01134672, + "auxiliary_loss_mlp": 0.01084888, + "balance_loss_clip": 1.02537489, + "balance_loss_mlp": 1.00468361, + "epoch": 0.7352852762580413, + "flos": 18186349184640.0, + "grad_norm": 2.0514381417567034, + "language_loss": 0.70600605, + "learning_rate": 6.909744160515532e-07, + "loss": 0.72820163, + "num_input_tokens_seen": 131393125, + "step": 6115, + "time_per_iteration": 2.588454008102417 + }, + { + "auxiliary_loss_clip": 0.0111391, + "auxiliary_loss_mlp": 0.01083465, + "balance_loss_clip": 1.02419925, + "balance_loss_mlp": 1.00326061, + "epoch": 0.7354055191486804, + "flos": 38910073063680.0, + "grad_norm": 2.520553075537179, + "language_loss": 0.69336355, + "learning_rate": 6.903855731189849e-07, + "loss": 0.71533728, + "num_input_tokens_seen": 131415760, + "step": 6116, + "time_per_iteration": 3.7614617347717285 + }, + { + "auxiliary_loss_clip": 0.01116214, + "auxiliary_loss_mlp": 0.01084319, + "balance_loss_clip": 1.0233283, + "balance_loss_mlp": 1.00406766, + "epoch": 0.7355257620393194, + "flos": 16289978647680.0, + "grad_norm": 2.594454661164177, + "language_loss": 0.81860852, + "learning_rate": 6.897969288538825e-07, + "loss": 0.84061384, + "num_input_tokens_seen": 131433705, + "step": 6117, + "time_per_iteration": 2.6575725078582764 + }, + { + "auxiliary_loss_clip": 0.01113302, + "auxiliary_loss_mlp": 0.01084195, + "balance_loss_clip": 1.02311683, + "balance_loss_mlp": 1.00408649, + "epoch": 0.7356460049299585, + "flos": 18114240631680.0, + "grad_norm": 1.7564591661552498, + "language_loss": 0.81670785, + "learning_rate": 6.892084833455452e-07, + "loss": 0.83868277, + "num_input_tokens_seen": 131453275, + "step": 6118, + "time_per_iteration": 3.583719253540039 + }, + { + "auxiliary_loss_clip": 0.01125307, + "auxiliary_loss_mlp": 0.01084006, + "balance_loss_clip": 1.02492523, + "balance_loss_mlp": 1.00389719, + "epoch": 0.7357662478205976, + "flos": 21325193118720.0, + "grad_norm": 1.3767485726298099, + "language_loss": 0.83991987, + "learning_rate": 6.886202366832384e-07, + "loss": 0.86201298, + "num_input_tokens_seen": 131474960, + "step": 6119, + "time_per_iteration": 2.674062967300415 + }, + { + "auxiliary_loss_clip": 0.01091546, + "auxiliary_loss_mlp": 0.01084715, + "balance_loss_clip": 1.01935744, + "balance_loss_mlp": 1.0045588, + "epoch": 0.7358864907112367, + "flos": 14246841139200.0, + "grad_norm": 1.6918180182293785, + "language_loss": 0.73595035, + "learning_rate": 6.880321889561987e-07, + "loss": 0.75771296, + "num_input_tokens_seen": 131492935, + "step": 6120, + "time_per_iteration": 3.759708881378174 + }, + { + "auxiliary_loss_clip": 0.01106201, + "auxiliary_loss_mlp": 0.0108336, + "balance_loss_clip": 1.02338409, + "balance_loss_mlp": 1.00301313, + "epoch": 0.7360067336018757, + "flos": 22309684058880.0, + "grad_norm": 1.827992475856212, + "language_loss": 0.65009165, + "learning_rate": 6.874443402536338e-07, + "loss": 0.67198724, + "num_input_tokens_seen": 131512025, + "step": 6121, + "time_per_iteration": 2.8035080432891846 + }, + { + "auxiliary_loss_clip": 0.01116013, + "auxiliary_loss_mlp": 0.01083825, + "balance_loss_clip": 1.0242219, + "balance_loss_mlp": 1.00371659, + "epoch": 0.7361269764925149, + "flos": 25554607833600.0, + "grad_norm": 1.6488926231718317, + "language_loss": 0.80593967, + "learning_rate": 6.868566906647177e-07, + "loss": 0.82793808, + "num_input_tokens_seen": 131532975, + "step": 6122, + "time_per_iteration": 2.7275331020355225 + }, + { + "auxiliary_loss_clip": 0.01127157, + "auxiliary_loss_mlp": 0.0108488, + "balance_loss_clip": 1.02524519, + "balance_loss_mlp": 1.00458074, + "epoch": 0.736247219383154, + "flos": 20376505059840.0, + "grad_norm": 1.9734351273737187, + "language_loss": 0.83507669, + "learning_rate": 6.862692402785984e-07, + "loss": 0.85719705, + "num_input_tokens_seen": 131553225, + "step": 6123, + "time_per_iteration": 2.7526116371154785 + }, + { + "auxiliary_loss_clip": 0.01080368, + "auxiliary_loss_mlp": 0.01079481, + "balance_loss_clip": 1.01021504, + "balance_loss_mlp": 1.00051641, + "epoch": 0.736367462273793, + "flos": 70339525735680.0, + "grad_norm": 0.681667931173518, + "language_loss": 0.49617267, + "learning_rate": 6.856819891843899e-07, + "loss": 0.51777118, + "num_input_tokens_seen": 131617930, + "step": 6124, + "time_per_iteration": 3.414257764816284 + }, + { + "auxiliary_loss_clip": 0.01079205, + "auxiliary_loss_mlp": 0.01084409, + "balance_loss_clip": 1.02063179, + "balance_loss_mlp": 1.00429988, + "epoch": 0.7364877051644322, + "flos": 22412711243520.0, + "grad_norm": 1.9410079428334062, + "language_loss": 0.71824038, + "learning_rate": 6.8509493747118e-07, + "loss": 0.73987651, + "num_input_tokens_seen": 131636740, + "step": 6125, + "time_per_iteration": 2.827826738357544 + }, + { + "auxiliary_loss_clip": 0.01136582, + "auxiliary_loss_mlp": 0.01084505, + "balance_loss_clip": 1.02721536, + "balance_loss_mlp": 1.0043484, + "epoch": 0.7366079480550712, + "flos": 12130266274560.0, + "grad_norm": 2.0814057708198552, + "language_loss": 0.88140309, + "learning_rate": 6.845080852280221e-07, + "loss": 0.90361392, + "num_input_tokens_seen": 131653810, + "step": 6126, + "time_per_iteration": 2.596970558166504 + }, + { + "auxiliary_loss_clip": 0.01106126, + "auxiliary_loss_mlp": 0.01084398, + "balance_loss_clip": 1.02345395, + "balance_loss_mlp": 1.00424135, + "epoch": 0.7367281909457103, + "flos": 15049336844160.0, + "grad_norm": 1.575098681339117, + "language_loss": 0.74314785, + "learning_rate": 6.839214325439409e-07, + "loss": 0.76505303, + "num_input_tokens_seen": 131671505, + "step": 6127, + "time_per_iteration": 2.7815120220184326 + }, + { + "auxiliary_loss_clip": 0.01107403, + "auxiliary_loss_mlp": 0.01085194, + "balance_loss_clip": 1.02295673, + "balance_loss_mlp": 1.0050379, + "epoch": 0.7368484338363495, + "flos": 23510752053120.0, + "grad_norm": 1.7334301277723851, + "language_loss": 0.71522701, + "learning_rate": 6.833349795079327e-07, + "loss": 0.73715305, + "num_input_tokens_seen": 131690615, + "step": 6128, + "time_per_iteration": 2.753567934036255 + }, + { + "auxiliary_loss_clip": 0.01105908, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_clip": 1.02369988, + "balance_loss_mlp": 1.00357461, + "epoch": 0.7369686767269885, + "flos": 27417833095680.0, + "grad_norm": 1.660514287353378, + "language_loss": 0.68682718, + "learning_rate": 6.827487262089613e-07, + "loss": 0.70872408, + "num_input_tokens_seen": 131711120, + "step": 6129, + "time_per_iteration": 2.8494837284088135 + }, + { + "auxiliary_loss_clip": 0.01102416, + "auxiliary_loss_mlp": 0.01078975, + "balance_loss_clip": 1.02392685, + "balance_loss_mlp": 1.00001097, + "epoch": 0.7370889196176276, + "flos": 70293343824000.0, + "grad_norm": 0.8919165430757928, + "language_loss": 0.5681296, + "learning_rate": 6.821626727359606e-07, + "loss": 0.58994353, + "num_input_tokens_seen": 131776680, + "step": 6130, + "time_per_iteration": 3.3766613006591797 + }, + { + "auxiliary_loss_clip": 0.01110173, + "auxiliary_loss_mlp": 0.01084236, + "balance_loss_clip": 1.0213691, + "balance_loss_mlp": 1.00393677, + "epoch": 0.7372091625082667, + "flos": 18040839189120.0, + "grad_norm": 2.0855260656817918, + "language_loss": 0.77237326, + "learning_rate": 6.815768191778348e-07, + "loss": 0.79431736, + "num_input_tokens_seen": 131794760, + "step": 6131, + "time_per_iteration": 2.6767804622650146 + }, + { + "auxiliary_loss_clip": 0.01127501, + "auxiliary_loss_mlp": 0.0108393, + "balance_loss_clip": 1.02597046, + "balance_loss_mlp": 1.00367856, + "epoch": 0.7373294053989058, + "flos": 33726331854720.0, + "grad_norm": 1.7506129991402304, + "language_loss": 0.72636724, + "learning_rate": 6.809911656234569e-07, + "loss": 0.74848151, + "num_input_tokens_seen": 131816735, + "step": 6132, + "time_per_iteration": 2.780782699584961 + }, + { + "auxiliary_loss_clip": 0.01107971, + "auxiliary_loss_mlp": 0.01084205, + "balance_loss_clip": 1.02272511, + "balance_loss_mlp": 1.00414371, + "epoch": 0.7374496482895448, + "flos": 21506326427520.0, + "grad_norm": 1.8962028843176941, + "language_loss": 0.78059179, + "learning_rate": 6.804057121616707e-07, + "loss": 0.80251354, + "num_input_tokens_seen": 131834940, + "step": 6133, + "time_per_iteration": 2.7126762866973877 + }, + { + "auxiliary_loss_clip": 0.01125926, + "auxiliary_loss_mlp": 0.01083833, + "balance_loss_clip": 1.02499533, + "balance_loss_mlp": 1.00367689, + "epoch": 0.737569891180184, + "flos": 24936908624640.0, + "grad_norm": 3.1274543822973206, + "language_loss": 0.72324008, + "learning_rate": 6.798204588812888e-07, + "loss": 0.74533772, + "num_input_tokens_seen": 131854355, + "step": 6134, + "time_per_iteration": 2.758763074874878 + }, + { + "auxiliary_loss_clip": 0.01086182, + "auxiliary_loss_mlp": 0.00872958, + "balance_loss_clip": 1.02116597, + "balance_loss_mlp": 1.00009036, + "epoch": 0.7376901340708231, + "flos": 20664544222080.0, + "grad_norm": 21.559775067838828, + "language_loss": 0.7571578, + "learning_rate": 6.792354058710937e-07, + "loss": 0.77674913, + "num_input_tokens_seen": 131871825, + "step": 6135, + "time_per_iteration": 2.877533435821533 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01084638, + "balance_loss_clip": 1.02484632, + "balance_loss_mlp": 1.00452936, + "epoch": 0.7378103769614621, + "flos": 23805794367360.0, + "grad_norm": 1.7311829149016893, + "language_loss": 0.65325916, + "learning_rate": 6.786505532198374e-07, + "loss": 0.67543769, + "num_input_tokens_seen": 131890770, + "step": 6136, + "time_per_iteration": 2.696197032928467 + }, + { + "auxiliary_loss_clip": 0.01135691, + "auxiliary_loss_mlp": 0.01085234, + "balance_loss_clip": 1.02605176, + "balance_loss_mlp": 1.00498247, + "epoch": 0.7379306198521013, + "flos": 22237216369920.0, + "grad_norm": 1.6810594215379255, + "language_loss": 0.85570633, + "learning_rate": 6.780659010162411e-07, + "loss": 0.87791562, + "num_input_tokens_seen": 131909720, + "step": 6137, + "time_per_iteration": 2.6556544303894043 + }, + { + "auxiliary_loss_clip": 0.01105918, + "auxiliary_loss_mlp": 0.01083938, + "balance_loss_clip": 1.02286708, + "balance_loss_mlp": 1.00387716, + "epoch": 0.7380508627427403, + "flos": 14903108576640.0, + "grad_norm": 1.6119325621321747, + "language_loss": 0.83159769, + "learning_rate": 6.774814493489975e-07, + "loss": 0.85349625, + "num_input_tokens_seen": 131927395, + "step": 6138, + "time_per_iteration": 2.7214598655700684 + }, + { + "auxiliary_loss_clip": 0.011233, + "auxiliary_loss_mlp": 0.01083264, + "balance_loss_clip": 1.02349687, + "balance_loss_mlp": 1.00320268, + "epoch": 0.7381711056333794, + "flos": 21685843624320.0, + "grad_norm": 1.7063618102725318, + "language_loss": 0.66082251, + "learning_rate": 6.768971983067655e-07, + "loss": 0.68288815, + "num_input_tokens_seen": 131947725, + "step": 6139, + "time_per_iteration": 3.6103785037994385 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01079237, + "balance_loss_clip": 1.01987529, + "balance_loss_mlp": 1.00027275, + "epoch": 0.7382913485240186, + "flos": 52404263596800.0, + "grad_norm": 1.0021095811579905, + "language_loss": 0.67784512, + "learning_rate": 6.763131479781772e-07, + "loss": 0.69978917, + "num_input_tokens_seen": 131997485, + "step": 6140, + "time_per_iteration": 3.0114452838897705 + }, + { + "auxiliary_loss_clip": 0.01106472, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_clip": 1.02232826, + "balance_loss_mlp": 1.00496387, + "epoch": 0.7384115914146576, + "flos": 21798818876160.0, + "grad_norm": 1.8811931160106408, + "language_loss": 0.75992972, + "learning_rate": 6.757292984518316e-07, + "loss": 0.78184611, + "num_input_tokens_seen": 132016885, + "step": 6141, + "time_per_iteration": 2.746553659439087 + }, + { + "auxiliary_loss_clip": 0.01107703, + "auxiliary_loss_mlp": 0.01079243, + "balance_loss_clip": 1.02046335, + "balance_loss_mlp": 1.00027895, + "epoch": 0.7385318343052967, + "flos": 61494331662720.0, + "grad_norm": 0.7468796951036552, + "language_loss": 0.5646565, + "learning_rate": 6.751456498162981e-07, + "loss": 0.58652598, + "num_input_tokens_seen": 132075920, + "step": 6142, + "time_per_iteration": 4.151216268539429 + }, + { + "auxiliary_loss_clip": 0.01125972, + "auxiliary_loss_mlp": 0.01083898, + "balance_loss_clip": 1.02454591, + "balance_loss_mlp": 1.00378966, + "epoch": 0.7386520771959358, + "flos": 17013757697280.0, + "grad_norm": 1.730414871128987, + "language_loss": 0.85447401, + "learning_rate": 6.745622021601174e-07, + "loss": 0.87657273, + "num_input_tokens_seen": 132092945, + "step": 6143, + "time_per_iteration": 2.610212564468384 + }, + { + "auxiliary_loss_clip": 0.01105185, + "auxiliary_loss_mlp": 0.01084412, + "balance_loss_clip": 1.02271271, + "balance_loss_mlp": 1.00430346, + "epoch": 0.7387723200865749, + "flos": 18770759464320.0, + "grad_norm": 1.8683742729957762, + "language_loss": 0.69980699, + "learning_rate": 6.739789555717954e-07, + "loss": 0.72170299, + "num_input_tokens_seen": 132109920, + "step": 6144, + "time_per_iteration": 3.6392457485198975 + }, + { + "auxiliary_loss_clip": 0.01132665, + "auxiliary_loss_mlp": 0.01083658, + "balance_loss_clip": 1.02373719, + "balance_loss_mlp": 1.00350118, + "epoch": 0.738892562977214, + "flos": 22525542840960.0, + "grad_norm": 1.8926053711060626, + "language_loss": 0.77383375, + "learning_rate": 6.733959101398124e-07, + "loss": 0.79599708, + "num_input_tokens_seen": 132128050, + "step": 6145, + "time_per_iteration": 3.6046302318573 + }, + { + "auxiliary_loss_clip": 0.01116609, + "auxiliary_loss_mlp": 0.01084015, + "balance_loss_clip": 1.02326393, + "balance_loss_mlp": 1.0038588, + "epoch": 0.7390128058678531, + "flos": 21501478091520.0, + "grad_norm": 1.5544181274843325, + "language_loss": 0.81139123, + "learning_rate": 6.728130659526143e-07, + "loss": 0.83339745, + "num_input_tokens_seen": 132145860, + "step": 6146, + "time_per_iteration": 2.730757474899292 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.02330089, + "balance_loss_mlp": 1.00449944, + "epoch": 0.7391330487584922, + "flos": 25776176878080.0, + "grad_norm": 2.359951618388174, + "language_loss": 0.71538639, + "learning_rate": 6.7223042309862e-07, + "loss": 0.73739195, + "num_input_tokens_seen": 132166060, + "step": 6147, + "time_per_iteration": 2.777344226837158 + }, + { + "auxiliary_loss_clip": 0.01125678, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_clip": 1.02429402, + "balance_loss_mlp": 1.00432503, + "epoch": 0.7392532916491312, + "flos": 28366736636160.0, + "grad_norm": 3.143389424057296, + "language_loss": 0.73720038, + "learning_rate": 6.716479816662144e-07, + "loss": 0.75930148, + "num_input_tokens_seen": 132187790, + "step": 6148, + "time_per_iteration": 2.710599660873413 + }, + { + "auxiliary_loss_clip": 0.01117666, + "auxiliary_loss_mlp": 0.01083567, + "balance_loss_clip": 1.02441514, + "balance_loss_mlp": 1.00345874, + "epoch": 0.7393735345397703, + "flos": 23585877348480.0, + "grad_norm": 3.105082099748467, + "language_loss": 0.72919315, + "learning_rate": 6.710657417437531e-07, + "loss": 0.7512055, + "num_input_tokens_seen": 132207495, + "step": 6149, + "time_per_iteration": 2.9166436195373535 + }, + { + "auxiliary_loss_clip": 0.01110285, + "auxiliary_loss_mlp": 0.01083805, + "balance_loss_clip": 1.02375162, + "balance_loss_mlp": 1.00369644, + "epoch": 0.7394937774304094, + "flos": 19974772373760.0, + "grad_norm": 1.9188456041493558, + "language_loss": 0.80153251, + "learning_rate": 6.704837034195628e-07, + "loss": 0.82347345, + "num_input_tokens_seen": 132225960, + "step": 6150, + "time_per_iteration": 2.7802951335906982 + }, + { + "auxiliary_loss_clip": 0.01126418, + "auxiliary_loss_mlp": 0.01083617, + "balance_loss_clip": 1.02507877, + "balance_loss_mlp": 1.00350785, + "epoch": 0.7396140203210485, + "flos": 23478037741440.0, + "grad_norm": 1.673875715020611, + "language_loss": 0.84816396, + "learning_rate": 6.699018667819376e-07, + "loss": 0.87026429, + "num_input_tokens_seen": 132245360, + "step": 6151, + "time_per_iteration": 2.6993324756622314 + }, + { + "auxiliary_loss_clip": 0.01126764, + "auxiliary_loss_mlp": 0.01085094, + "balance_loss_clip": 1.02500379, + "balance_loss_mlp": 1.0047946, + "epoch": 0.7397342632116876, + "flos": 25555433846400.0, + "grad_norm": 1.9610309605901057, + "language_loss": 0.72685683, + "learning_rate": 6.693202319191415e-07, + "loss": 0.7489754, + "num_input_tokens_seen": 132267095, + "step": 6152, + "time_per_iteration": 2.8438920974731445 + }, + { + "auxiliary_loss_clip": 0.01135485, + "auxiliary_loss_mlp": 0.01085302, + "balance_loss_clip": 1.02686846, + "balance_loss_mlp": 1.00514603, + "epoch": 0.7398545061023267, + "flos": 24755021130240.0, + "grad_norm": 1.75801413102893, + "language_loss": 0.74613619, + "learning_rate": 6.687387989194084e-07, + "loss": 0.76834404, + "num_input_tokens_seen": 132286610, + "step": 6153, + "time_per_iteration": 2.649827003479004 + }, + { + "auxiliary_loss_clip": 0.01109443, + "auxiliary_loss_mlp": 0.01083609, + "balance_loss_clip": 1.02074671, + "balance_loss_mlp": 1.00350058, + "epoch": 0.7399747489929658, + "flos": 16508602776960.0, + "grad_norm": 1.975855092097532, + "language_loss": 0.78869724, + "learning_rate": 6.681575678709404e-07, + "loss": 0.8106277, + "num_input_tokens_seen": 132305300, + "step": 6154, + "time_per_iteration": 2.7427709102630615 + }, + { + "auxiliary_loss_clip": 0.01123749, + "auxiliary_loss_mlp": 0.01083919, + "balance_loss_clip": 1.02354717, + "balance_loss_mlp": 1.00381041, + "epoch": 0.7400949918836048, + "flos": 24097065753600.0, + "grad_norm": 1.7193290641366503, + "language_loss": 0.70688212, + "learning_rate": 6.67576538861911e-07, + "loss": 0.72895873, + "num_input_tokens_seen": 132323875, + "step": 6155, + "time_per_iteration": 2.7126126289367676 + }, + { + "auxiliary_loss_clip": 0.01110918, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_clip": 1.02467465, + "balance_loss_mlp": 1.00448513, + "epoch": 0.740215234774244, + "flos": 21802517976960.0, + "grad_norm": 1.5209237406122367, + "language_loss": 0.82027102, + "learning_rate": 6.669957119804612e-07, + "loss": 0.84222615, + "num_input_tokens_seen": 132345510, + "step": 6156, + "time_per_iteration": 2.783813953399658 + }, + { + "auxiliary_loss_clip": 0.01117229, + "auxiliary_loss_mlp": 0.01084016, + "balance_loss_clip": 1.02417731, + "balance_loss_mlp": 1.00376439, + "epoch": 0.7403354776648831, + "flos": 18733196816640.0, + "grad_norm": 2.889717807569107, + "language_loss": 0.72584367, + "learning_rate": 6.66415087314702e-07, + "loss": 0.74785614, + "num_input_tokens_seen": 132360465, + "step": 6157, + "time_per_iteration": 2.712083339691162 + }, + { + "auxiliary_loss_clip": 0.01116145, + "auxiliary_loss_mlp": 0.01084026, + "balance_loss_clip": 1.02445269, + "balance_loss_mlp": 1.00382185, + "epoch": 0.7404557205555221, + "flos": 16909581277440.0, + "grad_norm": 2.111866257075509, + "language_loss": 0.73014474, + "learning_rate": 6.65834664952714e-07, + "loss": 0.75214648, + "num_input_tokens_seen": 132377915, + "step": 6158, + "time_per_iteration": 2.7477660179138184 + }, + { + "auxiliary_loss_clip": 0.01106582, + "auxiliary_loss_mlp": 0.01083999, + "balance_loss_clip": 1.02294993, + "balance_loss_mlp": 1.00389004, + "epoch": 0.7405759634461613, + "flos": 21214408596480.0, + "grad_norm": 1.5471382041902622, + "language_loss": 0.76018214, + "learning_rate": 6.652544449825457e-07, + "loss": 0.78208798, + "num_input_tokens_seen": 132398170, + "step": 6159, + "time_per_iteration": 2.8044328689575195 + }, + { + "auxiliary_loss_clip": 0.01101103, + "auxiliary_loss_mlp": 0.01084837, + "balance_loss_clip": 1.02462912, + "balance_loss_mlp": 1.00453806, + "epoch": 0.7406962063368003, + "flos": 20480106862080.0, + "grad_norm": 1.5030259809417177, + "language_loss": 0.76356566, + "learning_rate": 6.646744274922182e-07, + "loss": 0.78542507, + "num_input_tokens_seen": 132416615, + "step": 6160, + "time_per_iteration": 2.7271344661712646 + }, + { + "auxiliary_loss_clip": 0.01115811, + "auxiliary_loss_mlp": 0.01083456, + "balance_loss_clip": 1.0234797, + "balance_loss_mlp": 1.00325215, + "epoch": 0.7408164492274394, + "flos": 19791915212160.0, + "grad_norm": 2.963004831258159, + "language_loss": 0.75530481, + "learning_rate": 6.640946125697171e-07, + "loss": 0.77729756, + "num_input_tokens_seen": 132434145, + "step": 6161, + "time_per_iteration": 2.724483013153076 + }, + { + "auxiliary_loss_clip": 0.01124312, + "auxiliary_loss_mlp": 0.01084294, + "balance_loss_clip": 1.02348065, + "balance_loss_mlp": 1.00404274, + "epoch": 0.7409366921180786, + "flos": 29204855654400.0, + "grad_norm": 1.769931410228112, + "language_loss": 0.75720954, + "learning_rate": 6.635150003030017e-07, + "loss": 0.77929556, + "num_input_tokens_seen": 132452670, + "step": 6162, + "time_per_iteration": 2.7431581020355225 + }, + { + "auxiliary_loss_clip": 0.01082219, + "auxiliary_loss_mlp": 0.01084234, + "balance_loss_clip": 1.02318716, + "balance_loss_mlp": 1.00407767, + "epoch": 0.7410569350087176, + "flos": 22930004960640.0, + "grad_norm": 2.0321401706611386, + "language_loss": 0.85852754, + "learning_rate": 6.629355907799981e-07, + "loss": 0.88019204, + "num_input_tokens_seen": 132472475, + "step": 6163, + "time_per_iteration": 2.807000160217285 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_clip": 1.02390695, + "balance_loss_mlp": 1.00597847, + "epoch": 0.7411771778993567, + "flos": 30440397726720.0, + "grad_norm": 1.8997036280527944, + "language_loss": 0.69338185, + "learning_rate": 6.623563840886015e-07, + "loss": 0.71548694, + "num_input_tokens_seen": 132493400, + "step": 6164, + "time_per_iteration": 2.7588727474212646 + }, + { + "auxiliary_loss_clip": 0.01125317, + "auxiliary_loss_mlp": 0.01083257, + "balance_loss_clip": 1.0243659, + "balance_loss_mlp": 1.00329161, + "epoch": 0.7412974207899958, + "flos": 20522050968960.0, + "grad_norm": 1.7200900660155636, + "language_loss": 0.69433528, + "learning_rate": 6.617773803166795e-07, + "loss": 0.71642101, + "num_input_tokens_seen": 132511725, + "step": 6165, + "time_per_iteration": 3.5371458530426025 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.00872974, + "balance_loss_clip": 1.02359462, + "balance_loss_mlp": 1.0001179, + "epoch": 0.7414176636806349, + "flos": 22090700793600.0, + "grad_norm": 2.0939857151877206, + "language_loss": 0.81883723, + "learning_rate": 6.611985795520634e-07, + "loss": 0.83871925, + "num_input_tokens_seen": 132530270, + "step": 6166, + "time_per_iteration": 2.8013429641723633 + }, + { + "auxiliary_loss_clip": 0.01109888, + "auxiliary_loss_mlp": 0.01085287, + "balance_loss_clip": 1.02566123, + "balance_loss_mlp": 1.00513101, + "epoch": 0.7415379065712739, + "flos": 25155245445120.0, + "grad_norm": 2.0465981840725376, + "language_loss": 0.77555376, + "learning_rate": 6.606199818825588e-07, + "loss": 0.7975055, + "num_input_tokens_seen": 132550725, + "step": 6167, + "time_per_iteration": 3.718050956726074 + }, + { + "auxiliary_loss_clip": 0.01116639, + "auxiliary_loss_mlp": 0.01084079, + "balance_loss_clip": 1.0232569, + "balance_loss_mlp": 1.00397027, + "epoch": 0.7416581494619131, + "flos": 16871731320960.0, + "grad_norm": 1.7610490105646177, + "language_loss": 0.81717157, + "learning_rate": 6.600415873959377e-07, + "loss": 0.83917874, + "num_input_tokens_seen": 132568600, + "step": 6168, + "time_per_iteration": 3.607795476913452 + }, + { + "auxiliary_loss_clip": 0.01085434, + "auxiliary_loss_mlp": 0.008728, + "balance_loss_clip": 1.01988995, + "balance_loss_mlp": 1.00016415, + "epoch": 0.7417783923525522, + "flos": 28438881102720.0, + "grad_norm": 1.8189640477697806, + "language_loss": 0.64684379, + "learning_rate": 6.594633961799437e-07, + "loss": 0.66642618, + "num_input_tokens_seen": 132587640, + "step": 6169, + "time_per_iteration": 2.8941473960876465 + }, + { + "auxiliary_loss_clip": 0.01109193, + "auxiliary_loss_mlp": 0.01084815, + "balance_loss_clip": 1.02476299, + "balance_loss_mlp": 1.00465882, + "epoch": 0.7418986352431912, + "flos": 20084299920000.0, + "grad_norm": 1.6282293800471745, + "language_loss": 0.81363118, + "learning_rate": 6.588854083222857e-07, + "loss": 0.83557129, + "num_input_tokens_seen": 132607075, + "step": 6170, + "time_per_iteration": 2.7423324584960938 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01084134, + "balance_loss_clip": 1.02360094, + "balance_loss_mlp": 1.00378728, + "epoch": 0.7420188781338304, + "flos": 18259571059200.0, + "grad_norm": 3.646450151960591, + "language_loss": 0.80456293, + "learning_rate": 6.583076239106444e-07, + "loss": 0.8265059, + "num_input_tokens_seen": 132625580, + "step": 6171, + "time_per_iteration": 3.6160354614257812 + }, + { + "auxiliary_loss_clip": 0.01116643, + "auxiliary_loss_mlp": 0.01084615, + "balance_loss_clip": 1.02415156, + "balance_loss_mlp": 1.00441062, + "epoch": 0.7421391210244694, + "flos": 13771994319360.0, + "grad_norm": 2.075345191707912, + "language_loss": 0.7541995, + "learning_rate": 6.577300430326707e-07, + "loss": 0.77621204, + "num_input_tokens_seen": 132640525, + "step": 6172, + "time_per_iteration": 2.668872833251953 + }, + { + "auxiliary_loss_clip": 0.01104218, + "auxiliary_loss_mlp": 0.01083787, + "balance_loss_clip": 1.02229929, + "balance_loss_mlp": 1.00372624, + "epoch": 0.7422593639151085, + "flos": 15961683317760.0, + "grad_norm": 2.921409328843306, + "language_loss": 0.71983147, + "learning_rate": 6.571526657759821e-07, + "loss": 0.7417115, + "num_input_tokens_seen": 132656265, + "step": 6173, + "time_per_iteration": 2.7629902362823486 + }, + { + "auxiliary_loss_clip": 0.01126191, + "auxiliary_loss_mlp": 0.01084751, + "balance_loss_clip": 1.02486324, + "balance_loss_mlp": 1.00449967, + "epoch": 0.7423796068057477, + "flos": 30114400867200.0, + "grad_norm": 1.8303444457719291, + "language_loss": 0.70405191, + "learning_rate": 6.565754922281663e-07, + "loss": 0.72616136, + "num_input_tokens_seen": 132678510, + "step": 6174, + "time_per_iteration": 2.707883358001709 + }, + { + "auxiliary_loss_clip": 0.0111695, + "auxiliary_loss_mlp": 0.01084504, + "balance_loss_clip": 1.02370882, + "balance_loss_mlp": 1.00434721, + "epoch": 0.7424998496963867, + "flos": 20521907314560.0, + "grad_norm": 1.6562061558680161, + "language_loss": 0.78307426, + "learning_rate": 6.559985224767801e-07, + "loss": 0.80508876, + "num_input_tokens_seen": 132696385, + "step": 6175, + "time_per_iteration": 2.673466205596924 + }, + { + "auxiliary_loss_clip": 0.01090784, + "auxiliary_loss_mlp": 0.01085049, + "balance_loss_clip": 1.02415764, + "balance_loss_mlp": 1.00479746, + "epoch": 0.7426200925870258, + "flos": 21871573873920.0, + "grad_norm": 2.7369836627424173, + "language_loss": 0.75437176, + "learning_rate": 6.55421756609349e-07, + "loss": 0.77613008, + "num_input_tokens_seen": 132714640, + "step": 6176, + "time_per_iteration": 2.8137378692626953 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.01083729, + "balance_loss_clip": 1.02196717, + "balance_loss_mlp": 1.00362062, + "epoch": 0.7427403354776649, + "flos": 26432049265920.0, + "grad_norm": 1.9147233203468967, + "language_loss": 0.78982621, + "learning_rate": 6.54845194713369e-07, + "loss": 0.81186485, + "num_input_tokens_seen": 132735590, + "step": 6177, + "time_per_iteration": 2.689192771911621 + }, + { + "auxiliary_loss_clip": 0.01128018, + "auxiliary_loss_mlp": 0.01084852, + "balance_loss_clip": 1.02658844, + "balance_loss_mlp": 1.00479066, + "epoch": 0.742860578368304, + "flos": 19898390102400.0, + "grad_norm": 1.956380608206099, + "language_loss": 0.80176729, + "learning_rate": 6.542688368763034e-07, + "loss": 0.82389599, + "num_input_tokens_seen": 132753995, + "step": 6178, + "time_per_iteration": 2.7819859981536865 + }, + { + "auxiliary_loss_clip": 0.01124017, + "auxiliary_loss_mlp": 0.0108461, + "balance_loss_clip": 1.02485454, + "balance_loss_mlp": 1.00450134, + "epoch": 0.742980821258943, + "flos": 24827201510400.0, + "grad_norm": 1.6152119344102707, + "language_loss": 0.77178264, + "learning_rate": 6.536926831855854e-07, + "loss": 0.7938689, + "num_input_tokens_seen": 132773160, + "step": 6179, + "time_per_iteration": 2.7026419639587402 + }, + { + "auxiliary_loss_clip": 0.01112941, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_clip": 1.02287233, + "balance_loss_mlp": 1.00435972, + "epoch": 0.7431010641495821, + "flos": 25228646887680.0, + "grad_norm": 2.336673643646447, + "language_loss": 0.72967744, + "learning_rate": 6.531167337286165e-07, + "loss": 0.75165153, + "num_input_tokens_seen": 132793180, + "step": 6180, + "time_per_iteration": 2.802279472351074 + }, + { + "auxiliary_loss_clip": 0.01113821, + "auxiliary_loss_mlp": 0.01083524, + "balance_loss_clip": 1.02371693, + "balance_loss_mlp": 1.00341487, + "epoch": 0.7432213070402213, + "flos": 21762369550080.0, + "grad_norm": 1.5231088197628708, + "language_loss": 0.79897332, + "learning_rate": 6.52540988592768e-07, + "loss": 0.82094675, + "num_input_tokens_seen": 132814200, + "step": 6181, + "time_per_iteration": 2.767230987548828 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01083478, + "balance_loss_clip": 1.02388144, + "balance_loss_mlp": 1.00332165, + "epoch": 0.7433415499308603, + "flos": 14793832425600.0, + "grad_norm": 3.9026788062414264, + "language_loss": 0.832986, + "learning_rate": 6.519654478653814e-07, + "loss": 0.8549788, + "num_input_tokens_seen": 132832565, + "step": 6182, + "time_per_iteration": 2.765800952911377 + }, + { + "auxiliary_loss_clip": 0.01099521, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_clip": 1.0200696, + "balance_loss_mlp": 1.00047529, + "epoch": 0.7434617928214994, + "flos": 67155577297920.0, + "grad_norm": 0.7511535295580691, + "language_loss": 0.56161427, + "learning_rate": 6.51390111633763e-07, + "loss": 0.58340389, + "num_input_tokens_seen": 132897840, + "step": 6183, + "time_per_iteration": 3.3221182823181152 + }, + { + "auxiliary_loss_clip": 0.01085333, + "auxiliary_loss_mlp": 0.01084333, + "balance_loss_clip": 1.02317142, + "balance_loss_mlp": 1.00417709, + "epoch": 0.7435820357121385, + "flos": 27377576928000.0, + "grad_norm": 1.9463021345751352, + "language_loss": 0.76414502, + "learning_rate": 6.508149799851932e-07, + "loss": 0.7858417, + "num_input_tokens_seen": 132919505, + "step": 6184, + "time_per_iteration": 2.876877546310425 + }, + { + "auxiliary_loss_clip": 0.01114344, + "auxiliary_loss_mlp": 0.01083661, + "balance_loss_clip": 1.02293825, + "balance_loss_mlp": 1.00359964, + "epoch": 0.7437022786027776, + "flos": 23987645948160.0, + "grad_norm": 2.7785894185549016, + "language_loss": 0.61515206, + "learning_rate": 6.502400530069183e-07, + "loss": 0.63713211, + "num_input_tokens_seen": 132939390, + "step": 6185, + "time_per_iteration": 2.715677261352539 + }, + { + "auxiliary_loss_clip": 0.01108844, + "auxiliary_loss_mlp": 0.01083254, + "balance_loss_clip": 1.02562594, + "balance_loss_mlp": 1.00295496, + "epoch": 0.7438225214934167, + "flos": 21866761451520.0, + "grad_norm": 1.7086535113245225, + "language_loss": 0.68509078, + "learning_rate": 6.496653307861535e-07, + "loss": 0.70701182, + "num_input_tokens_seen": 132960060, + "step": 6186, + "time_per_iteration": 2.757763385772705 + }, + { + "auxiliary_loss_clip": 0.01128229, + "auxiliary_loss_mlp": 0.01084734, + "balance_loss_clip": 1.02622008, + "balance_loss_mlp": 1.00452971, + "epoch": 0.7439427643840558, + "flos": 20230097224320.0, + "grad_norm": 1.7923344859341692, + "language_loss": 0.65661633, + "learning_rate": 6.490908134100857e-07, + "loss": 0.67874593, + "num_input_tokens_seen": 132978525, + "step": 6187, + "time_per_iteration": 2.68288516998291 + }, + { + "auxiliary_loss_clip": 0.0112714, + "auxiliary_loss_mlp": 0.01085953, + "balance_loss_clip": 1.02552819, + "balance_loss_mlp": 1.00570107, + "epoch": 0.7440630072746949, + "flos": 20849915335680.0, + "grad_norm": 2.103057260456031, + "language_loss": 0.69485199, + "learning_rate": 6.48516500965866e-07, + "loss": 0.71698296, + "num_input_tokens_seen": 132998460, + "step": 6188, + "time_per_iteration": 2.6674318313598633 + }, + { + "auxiliary_loss_clip": 0.01126553, + "auxiliary_loss_mlp": 0.01084245, + "balance_loss_clip": 1.02474558, + "balance_loss_mlp": 1.00413597, + "epoch": 0.7441832501653339, + "flos": 26503762769280.0, + "grad_norm": 1.5122681457964637, + "language_loss": 0.81648731, + "learning_rate": 6.479423935406192e-07, + "loss": 0.83859527, + "num_input_tokens_seen": 133018445, + "step": 6189, + "time_per_iteration": 2.762939929962158 + }, + { + "auxiliary_loss_clip": 0.01094809, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_clip": 1.01633501, + "balance_loss_mlp": 1.00022531, + "epoch": 0.7443034930559731, + "flos": 68602848088320.0, + "grad_norm": 1.4794059026620043, + "language_loss": 0.62104958, + "learning_rate": 6.473684912214357e-07, + "loss": 0.64278954, + "num_input_tokens_seen": 133082005, + "step": 6190, + "time_per_iteration": 4.305307149887085 + }, + { + "auxiliary_loss_clip": 0.01124576, + "auxiliary_loss_mlp": 0.01084567, + "balance_loss_clip": 1.02463746, + "balance_loss_mlp": 1.00431585, + "epoch": 0.7444237359466122, + "flos": 18654982951680.0, + "grad_norm": 1.9647034370392957, + "language_loss": 0.69590825, + "learning_rate": 6.467947940953778e-07, + "loss": 0.7179997, + "num_input_tokens_seen": 133100530, + "step": 6191, + "time_per_iteration": 2.644219160079956 + }, + { + "auxiliary_loss_clip": 0.01117389, + "auxiliary_loss_mlp": 0.01084072, + "balance_loss_clip": 1.02542281, + "balance_loss_mlp": 1.00401115, + "epoch": 0.7445439788372512, + "flos": 22817604326400.0, + "grad_norm": 1.6622138938550288, + "language_loss": 0.72483689, + "learning_rate": 6.462213022494732e-07, + "loss": 0.7468515, + "num_input_tokens_seen": 133119775, + "step": 6192, + "time_per_iteration": 2.7732467651367188 + }, + { + "auxiliary_loss_clip": 0.01108114, + "auxiliary_loss_mlp": 0.01079066, + "balance_loss_clip": 1.02057934, + "balance_loss_mlp": 1.00010169, + "epoch": 0.7446642217278904, + "flos": 67045690615680.0, + "grad_norm": 0.7710318066046613, + "language_loss": 0.6105094, + "learning_rate": 6.456480157707201e-07, + "loss": 0.6323812, + "num_input_tokens_seen": 133184550, + "step": 6193, + "time_per_iteration": 5.017182111740112 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.01084117, + "balance_loss_clip": 1.02411437, + "balance_loss_mlp": 1.00400805, + "epoch": 0.7447844646185294, + "flos": 17417465631360.0, + "grad_norm": 1.8835060016689977, + "language_loss": 0.85190076, + "learning_rate": 6.450749347460866e-07, + "loss": 0.87381852, + "num_input_tokens_seen": 133201525, + "step": 6194, + "time_per_iteration": 2.714160919189453 + }, + { + "auxiliary_loss_clip": 0.01135177, + "auxiliary_loss_mlp": 0.01083798, + "balance_loss_clip": 1.02584934, + "balance_loss_mlp": 1.00354612, + "epoch": 0.7449047075091685, + "flos": 26615876094720.0, + "grad_norm": 2.274303491464085, + "language_loss": 0.78806841, + "learning_rate": 6.445020592625083e-07, + "loss": 0.81025815, + "num_input_tokens_seen": 133222175, + "step": 6195, + "time_per_iteration": 2.6589138507843018 + }, + { + "auxiliary_loss_clip": 0.01134444, + "auxiliary_loss_mlp": 0.01083833, + "balance_loss_clip": 1.02512968, + "balance_loss_mlp": 1.00381947, + "epoch": 0.7450249503998077, + "flos": 14170458867840.0, + "grad_norm": 2.14434968948959, + "language_loss": 0.80069947, + "learning_rate": 6.4392938940689e-07, + "loss": 0.82288218, + "num_input_tokens_seen": 133237590, + "step": 6196, + "time_per_iteration": 2.570244312286377 + }, + { + "auxiliary_loss_clip": 0.01097264, + "auxiliary_loss_mlp": 0.00872939, + "balance_loss_clip": 1.02335644, + "balance_loss_mlp": 1.0001204, + "epoch": 0.7451451932904467, + "flos": 19606687752960.0, + "grad_norm": 2.0408915200778797, + "language_loss": 0.71051925, + "learning_rate": 6.433569252661049e-07, + "loss": 0.73022127, + "num_input_tokens_seen": 133255590, + "step": 6197, + "time_per_iteration": 3.65816330909729 + }, + { + "auxiliary_loss_clip": 0.01106463, + "auxiliary_loss_mlp": 0.01084178, + "balance_loss_clip": 1.02317071, + "balance_loss_mlp": 1.00411725, + "epoch": 0.7452654361810858, + "flos": 12495405980160.0, + "grad_norm": 1.7764247359572802, + "language_loss": 0.70922053, + "learning_rate": 6.427846669269952e-07, + "loss": 0.73112696, + "num_input_tokens_seen": 133273210, + "step": 6198, + "time_per_iteration": 2.6962180137634277 + }, + { + "auxiliary_loss_clip": 0.01138209, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_clip": 1.02846122, + "balance_loss_mlp": 1.00417209, + "epoch": 0.7453856790717249, + "flos": 22127329687680.0, + "grad_norm": 5.661181980491228, + "language_loss": 0.82352763, + "learning_rate": 6.422126144763729e-07, + "loss": 0.845752, + "num_input_tokens_seen": 133292600, + "step": 6199, + "time_per_iteration": 2.6835150718688965 + }, + { + "auxiliary_loss_clip": 0.01106604, + "auxiliary_loss_mlp": 0.00873026, + "balance_loss_clip": 1.02345061, + "balance_loss_mlp": 1.00009751, + "epoch": 0.745505921962364, + "flos": 20010682995840.0, + "grad_norm": 1.9897362593543684, + "language_loss": 0.76633739, + "learning_rate": 6.416407680010174e-07, + "loss": 0.78613365, + "num_input_tokens_seen": 133306960, + "step": 6200, + "time_per_iteration": 2.840588331222534 + }, + { + "auxiliary_loss_clip": 0.01085103, + "auxiliary_loss_mlp": 0.01085639, + "balance_loss_clip": 1.02476346, + "balance_loss_mlp": 1.00548291, + "epoch": 0.745626164853003, + "flos": 24677884673280.0, + "grad_norm": 1.9257342095455976, + "language_loss": 0.81055045, + "learning_rate": 6.410691275876774e-07, + "loss": 0.83225787, + "num_input_tokens_seen": 133326380, + "step": 6201, + "time_per_iteration": 2.814573049545288 + }, + { + "auxiliary_loss_clip": 0.01101852, + "auxiliary_loss_mlp": 0.01084097, + "balance_loss_clip": 1.02550149, + "balance_loss_mlp": 1.00398827, + "epoch": 0.7457464077436422, + "flos": 14538830797440.0, + "grad_norm": 2.2693852158373704, + "language_loss": 0.76789159, + "learning_rate": 6.404976933230704e-07, + "loss": 0.78975105, + "num_input_tokens_seen": 133342900, + "step": 6202, + "time_per_iteration": 2.7462685108184814 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.0108393, + "balance_loss_clip": 1.02477598, + "balance_loss_mlp": 1.00382137, + "epoch": 0.7458666506342813, + "flos": 34021194600960.0, + "grad_norm": 1.8565790682147012, + "language_loss": 0.72057116, + "learning_rate": 6.399264652938813e-07, + "loss": 0.74259281, + "num_input_tokens_seen": 133363805, + "step": 6203, + "time_per_iteration": 2.7888293266296387 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01083894, + "balance_loss_clip": 1.02392364, + "balance_loss_mlp": 1.00378561, + "epoch": 0.7459868935249203, + "flos": 24279025075200.0, + "grad_norm": 2.197769233859996, + "language_loss": 0.74451935, + "learning_rate": 6.393554435867679e-07, + "loss": 0.76651388, + "num_input_tokens_seen": 133384655, + "step": 6204, + "time_per_iteration": 2.746962547302246 + }, + { + "auxiliary_loss_clip": 0.01108213, + "auxiliary_loss_mlp": 0.01083898, + "balance_loss_clip": 1.02417231, + "balance_loss_mlp": 1.00369406, + "epoch": 0.7461071364155595, + "flos": 21908777385600.0, + "grad_norm": 2.272892450667668, + "language_loss": 0.83579385, + "learning_rate": 6.387846282883502e-07, + "loss": 0.85771501, + "num_input_tokens_seen": 133401185, + "step": 6205, + "time_per_iteration": 2.8032877445220947 + }, + { + "auxiliary_loss_clip": 0.01134384, + "auxiliary_loss_mlp": 0.01084824, + "balance_loss_clip": 1.02525187, + "balance_loss_mlp": 1.00466728, + "epoch": 0.7462273793061985, + "flos": 22889712879360.0, + "grad_norm": 1.9814221293365, + "language_loss": 0.77105927, + "learning_rate": 6.38214019485223e-07, + "loss": 0.7932514, + "num_input_tokens_seen": 133420010, + "step": 6206, + "time_per_iteration": 2.6196374893188477 + }, + { + "auxiliary_loss_clip": 0.01087102, + "auxiliary_loss_mlp": 0.01083949, + "balance_loss_clip": 1.02089357, + "balance_loss_mlp": 1.00374484, + "epoch": 0.7463476221968376, + "flos": 19968451580160.0, + "grad_norm": 3.5517452569577834, + "language_loss": 0.71461654, + "learning_rate": 6.376436172639461e-07, + "loss": 0.73632705, + "num_input_tokens_seen": 133437855, + "step": 6207, + "time_per_iteration": 2.878828287124634 + }, + { + "auxiliary_loss_clip": 0.0106011, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_clip": 1.01963282, + "balance_loss_mlp": 1.0038141, + "epoch": 0.7464678650874768, + "flos": 16836610798080.0, + "grad_norm": 2.6306446267915127, + "language_loss": 0.65288597, + "learning_rate": 6.370734217110487e-07, + "loss": 0.67432725, + "num_input_tokens_seen": 133456600, + "step": 6208, + "time_per_iteration": 2.8326056003570557 + }, + { + "auxiliary_loss_clip": 0.01116535, + "auxiliary_loss_mlp": 0.01085775, + "balance_loss_clip": 1.02478504, + "balance_loss_mlp": 1.00557077, + "epoch": 0.7465881079781158, + "flos": 48100869843840.0, + "grad_norm": 1.3823419921661435, + "language_loss": 0.64184576, + "learning_rate": 6.36503432913031e-07, + "loss": 0.66386884, + "num_input_tokens_seen": 133479745, + "step": 6209, + "time_per_iteration": 2.948347806930542 + }, + { + "auxiliary_loss_clip": 0.01119299, + "auxiliary_loss_mlp": 0.01083757, + "balance_loss_clip": 1.02474046, + "balance_loss_mlp": 1.00355315, + "epoch": 0.7467083508687549, + "flos": 19677359761920.0, + "grad_norm": 2.113266514523826, + "language_loss": 0.6910584, + "learning_rate": 6.359336509563569e-07, + "loss": 0.71308899, + "num_input_tokens_seen": 133495765, + "step": 6210, + "time_per_iteration": 2.6304547786712646 + }, + { + "auxiliary_loss_clip": 0.01100809, + "auxiliary_loss_mlp": 0.01084106, + "balance_loss_clip": 1.02386177, + "balance_loss_mlp": 1.00399721, + "epoch": 0.7468285937593939, + "flos": 17895436934400.0, + "grad_norm": 2.126221749138349, + "language_loss": 0.80772305, + "learning_rate": 6.353640759274641e-07, + "loss": 0.8295722, + "num_input_tokens_seen": 133514655, + "step": 6211, + "time_per_iteration": 2.7979257106781006 + }, + { + "auxiliary_loss_clip": 0.01125954, + "auxiliary_loss_mlp": 0.01084778, + "balance_loss_clip": 1.02455735, + "balance_loss_mlp": 1.00457382, + "epoch": 0.7469488366500331, + "flos": 23141446369920.0, + "grad_norm": 2.6964444899399345, + "language_loss": 0.74106175, + "learning_rate": 6.347947079127556e-07, + "loss": 0.76316911, + "num_input_tokens_seen": 133532555, + "step": 6212, + "time_per_iteration": 2.6452198028564453 + }, + { + "auxiliary_loss_clip": 0.01115804, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_clip": 1.02359581, + "balance_loss_mlp": 1.00525546, + "epoch": 0.7470690795406721, + "flos": 16690849407360.0, + "grad_norm": 2.0043905493271494, + "language_loss": 0.76811266, + "learning_rate": 6.342255469986053e-07, + "loss": 0.7901243, + "num_input_tokens_seen": 133551300, + "step": 6213, + "time_per_iteration": 2.6946322917938232 + }, + { + "auxiliary_loss_clip": 0.01136029, + "auxiliary_loss_mlp": 0.01083252, + "balance_loss_clip": 1.02655649, + "balance_loss_mlp": 1.00309539, + "epoch": 0.7471893224313112, + "flos": 25192700352000.0, + "grad_norm": 2.119335588452047, + "language_loss": 0.75953758, + "learning_rate": 6.336565932713533e-07, + "loss": 0.78173035, + "num_input_tokens_seen": 133570725, + "step": 6214, + "time_per_iteration": 2.6138572692871094 + }, + { + "auxiliary_loss_clip": 0.01110308, + "auxiliary_loss_mlp": 0.01084695, + "balance_loss_clip": 1.02116442, + "balance_loss_mlp": 1.00458694, + "epoch": 0.7473095653219504, + "flos": 22526225199360.0, + "grad_norm": 1.7187329780221605, + "language_loss": 0.77815223, + "learning_rate": 6.330878468173088e-07, + "loss": 0.80010223, + "num_input_tokens_seen": 133590790, + "step": 6215, + "time_per_iteration": 3.4656670093536377 + }, + { + "auxiliary_loss_clip": 0.01126637, + "auxiliary_loss_mlp": 0.01083, + "balance_loss_clip": 1.02529407, + "balance_loss_mlp": 1.00284386, + "epoch": 0.7474298082125894, + "flos": 18113989236480.0, + "grad_norm": 1.5571360856450058, + "language_loss": 0.72656548, + "learning_rate": 6.32519307722752e-07, + "loss": 0.74866188, + "num_input_tokens_seen": 133608685, + "step": 6216, + "time_per_iteration": 2.5093142986297607 + }, + { + "auxiliary_loss_clip": 0.01083395, + "auxiliary_loss_mlp": 0.01079112, + "balance_loss_clip": 1.01278377, + "balance_loss_mlp": 1.00014806, + "epoch": 0.7475500511032285, + "flos": 62086535193600.0, + "grad_norm": 0.8731912531745684, + "language_loss": 0.54968768, + "learning_rate": 6.31950976073929e-07, + "loss": 0.57131284, + "num_input_tokens_seen": 133662775, + "step": 6217, + "time_per_iteration": 3.1822562217712402 + }, + { + "auxiliary_loss_clip": 0.01092838, + "auxiliary_loss_mlp": 0.01084134, + "balance_loss_clip": 1.01987457, + "balance_loss_mlp": 1.00397801, + "epoch": 0.7476702939938676, + "flos": 17785586165760.0, + "grad_norm": 2.636500026102471, + "language_loss": 0.80749816, + "learning_rate": 6.31382851957055e-07, + "loss": 0.82926786, + "num_input_tokens_seen": 133679595, + "step": 6218, + "time_per_iteration": 3.589118003845215 + }, + { + "auxiliary_loss_clip": 0.01108535, + "auxiliary_loss_mlp": 0.00872917, + "balance_loss_clip": 1.02464008, + "balance_loss_mlp": 1.00010109, + "epoch": 0.7477905368845067, + "flos": 27927944092800.0, + "grad_norm": 2.0693126864228644, + "language_loss": 0.71645236, + "learning_rate": 6.308149354583143e-07, + "loss": 0.73626691, + "num_input_tokens_seen": 133699000, + "step": 6219, + "time_per_iteration": 3.6971275806427 + }, + { + "auxiliary_loss_clip": 0.01126351, + "auxiliary_loss_mlp": 0.01083861, + "balance_loss_clip": 1.02503633, + "balance_loss_mlp": 1.00370502, + "epoch": 0.7479107797751458, + "flos": 26870374932480.0, + "grad_norm": 1.7118932312810085, + "language_loss": 0.81727338, + "learning_rate": 6.302472266638586e-07, + "loss": 0.8393755, + "num_input_tokens_seen": 133719540, + "step": 6220, + "time_per_iteration": 2.647599697113037 + }, + { + "auxiliary_loss_clip": 0.0113711, + "auxiliary_loss_mlp": 0.01084713, + "balance_loss_clip": 1.02695155, + "balance_loss_mlp": 1.00441396, + "epoch": 0.7480310226657849, + "flos": 33943375785600.0, + "grad_norm": 2.0014334054703773, + "language_loss": 0.70270932, + "learning_rate": 6.296797256598101e-07, + "loss": 0.72492754, + "num_input_tokens_seen": 133741020, + "step": 6221, + "time_per_iteration": 2.816800594329834 + }, + { + "auxiliary_loss_clip": 0.01107097, + "auxiliary_loss_mlp": 0.01084712, + "balance_loss_clip": 1.02352452, + "balance_loss_mlp": 1.00465083, + "epoch": 0.748151265556424, + "flos": 24826555065600.0, + "grad_norm": 1.6537862025853296, + "language_loss": 0.81217778, + "learning_rate": 6.291124325322576e-07, + "loss": 0.8340959, + "num_input_tokens_seen": 133761145, + "step": 6222, + "time_per_iteration": 3.693723201751709 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.01083995, + "balance_loss_clip": 1.02611184, + "balance_loss_mlp": 1.00383902, + "epoch": 0.748271508447063, + "flos": 38399351535360.0, + "grad_norm": 1.7673200447274704, + "language_loss": 0.62403107, + "learning_rate": 6.285453473672595e-07, + "loss": 0.64605993, + "num_input_tokens_seen": 133783715, + "step": 6223, + "time_per_iteration": 2.7602365016937256 + }, + { + "auxiliary_loss_clip": 0.01134615, + "auxiliary_loss_mlp": 0.01084265, + "balance_loss_clip": 1.02512872, + "balance_loss_mlp": 1.00415647, + "epoch": 0.7483917513377022, + "flos": 21541842000000.0, + "grad_norm": 1.8388960596512052, + "language_loss": 0.75361657, + "learning_rate": 6.279784702508415e-07, + "loss": 0.77580535, + "num_input_tokens_seen": 133804465, + "step": 6224, + "time_per_iteration": 2.5608012676239014 + }, + { + "auxiliary_loss_clip": 0.01075348, + "auxiliary_loss_mlp": 0.010795, + "balance_loss_clip": 1.02096581, + "balance_loss_mlp": 1.00053561, + "epoch": 0.7485119942283412, + "flos": 62314532772480.0, + "grad_norm": 0.8093755011604745, + "language_loss": 0.58603442, + "learning_rate": 6.274118012689979e-07, + "loss": 0.60758281, + "num_input_tokens_seen": 133866365, + "step": 6225, + "time_per_iteration": 3.41001296043396 + }, + { + "auxiliary_loss_clip": 0.0111672, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_clip": 1.02426887, + "balance_loss_mlp": 1.00414395, + "epoch": 0.7486322371189803, + "flos": 29937613104000.0, + "grad_norm": 1.5554750539512983, + "language_loss": 0.68195581, + "learning_rate": 6.268453405076943e-07, + "loss": 0.70396602, + "num_input_tokens_seen": 133888760, + "step": 6226, + "time_per_iteration": 2.8493592739105225 + }, + { + "auxiliary_loss_clip": 0.01115708, + "auxiliary_loss_mlp": 0.0108442, + "balance_loss_clip": 1.02345347, + "balance_loss_mlp": 1.00431156, + "epoch": 0.7487524800096195, + "flos": 18949414734720.0, + "grad_norm": 1.8492436747618934, + "language_loss": 0.8228693, + "learning_rate": 6.262790880528592e-07, + "loss": 0.84487057, + "num_input_tokens_seen": 133906380, + "step": 6227, + "time_per_iteration": 2.6875927448272705 + }, + { + "auxiliary_loss_clip": 0.01108982, + "auxiliary_loss_mlp": 0.01084498, + "balance_loss_clip": 1.02361035, + "balance_loss_mlp": 1.00419879, + "epoch": 0.7488727229002585, + "flos": 18697393935360.0, + "grad_norm": 2.5802847967144236, + "language_loss": 0.79678226, + "learning_rate": 6.257130439903951e-07, + "loss": 0.81871706, + "num_input_tokens_seen": 133922875, + "step": 6228, + "time_per_iteration": 2.7675325870513916 + }, + { + "auxiliary_loss_clip": 0.0113622, + "auxiliary_loss_mlp": 0.01084514, + "balance_loss_clip": 1.02641296, + "balance_loss_mlp": 1.00435781, + "epoch": 0.7489929657908976, + "flos": 23623368168960.0, + "grad_norm": 1.7273999554483828, + "language_loss": 0.81315899, + "learning_rate": 6.251472084061695e-07, + "loss": 0.83536637, + "num_input_tokens_seen": 133941795, + "step": 6229, + "time_per_iteration": 2.7097411155700684 + }, + { + "auxiliary_loss_clip": 0.01124728, + "auxiliary_loss_mlp": 0.01083667, + "balance_loss_clip": 1.02479887, + "balance_loss_mlp": 1.00355864, + "epoch": 0.7491132086815367, + "flos": 20551533056640.0, + "grad_norm": 1.875180917631816, + "language_loss": 0.88750362, + "learning_rate": 6.245815813860191e-07, + "loss": 0.90958756, + "num_input_tokens_seen": 133957305, + "step": 6230, + "time_per_iteration": 2.6587631702423096 + }, + { + "auxiliary_loss_clip": 0.01134789, + "auxiliary_loss_mlp": 0.01084661, + "balance_loss_clip": 1.02510905, + "balance_loss_mlp": 1.00436127, + "epoch": 0.7492334515721758, + "flos": 23003011353600.0, + "grad_norm": 1.950494971772587, + "language_loss": 0.7046929, + "learning_rate": 6.240161630157495e-07, + "loss": 0.72688746, + "num_input_tokens_seen": 133976660, + "step": 6231, + "time_per_iteration": 2.642486572265625 + }, + { + "auxiliary_loss_clip": 0.01135071, + "auxiliary_loss_mlp": 0.01084173, + "balance_loss_clip": 1.02558208, + "balance_loss_mlp": 1.00392151, + "epoch": 0.7493536944628149, + "flos": 16398823835520.0, + "grad_norm": 2.133859089257672, + "language_loss": 0.70305079, + "learning_rate": 6.23450953381133e-07, + "loss": 0.72524321, + "num_input_tokens_seen": 133994750, + "step": 6232, + "time_per_iteration": 2.700978994369507 + }, + { + "auxiliary_loss_clip": 0.01116995, + "auxiliary_loss_mlp": 0.01083187, + "balance_loss_clip": 1.02398145, + "balance_loss_mlp": 1.0030781, + "epoch": 0.749473937353454, + "flos": 15338561155200.0, + "grad_norm": 1.85475942471203, + "language_loss": 0.68196487, + "learning_rate": 6.228859525679131e-07, + "loss": 0.70396674, + "num_input_tokens_seen": 134009165, + "step": 6233, + "time_per_iteration": 2.7279253005981445 + }, + { + "auxiliary_loss_clip": 0.01126018, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_clip": 1.02537489, + "balance_loss_mlp": 1.00384271, + "epoch": 0.7495941802440931, + "flos": 18951138587520.0, + "grad_norm": 2.7194400741513607, + "language_loss": 0.7978543, + "learning_rate": 6.223211606617986e-07, + "loss": 0.81995404, + "num_input_tokens_seen": 134027585, + "step": 6234, + "time_per_iteration": 2.677659749984741 + }, + { + "auxiliary_loss_clip": 0.01126774, + "auxiliary_loss_mlp": 0.01084841, + "balance_loss_clip": 1.02682829, + "balance_loss_mlp": 1.00501835, + "epoch": 0.7497144231347321, + "flos": 22492469393280.0, + "grad_norm": 1.6660438150548451, + "language_loss": 0.8409642, + "learning_rate": 6.217565777484701e-07, + "loss": 0.86308038, + "num_input_tokens_seen": 134046680, + "step": 6235, + "time_per_iteration": 2.7183279991149902 + }, + { + "auxiliary_loss_clip": 0.01115275, + "auxiliary_loss_mlp": 0.00872928, + "balance_loss_clip": 1.02440405, + "balance_loss_mlp": 1.00012851, + "epoch": 0.7498346660253713, + "flos": 24243509502720.0, + "grad_norm": 1.7320997174879054, + "language_loss": 0.8010481, + "learning_rate": 6.211922039135722e-07, + "loss": 0.82093012, + "num_input_tokens_seen": 134066825, + "step": 6236, + "time_per_iteration": 2.7694132328033447 + }, + { + "auxiliary_loss_clip": 0.01135982, + "auxiliary_loss_mlp": 0.01084278, + "balance_loss_clip": 1.02665806, + "balance_loss_mlp": 1.00407434, + "epoch": 0.7499549089160104, + "flos": 24387080163840.0, + "grad_norm": 1.7111448939610798, + "language_loss": 0.80978632, + "learning_rate": 6.206280392427201e-07, + "loss": 0.83198893, + "num_input_tokens_seen": 134086410, + "step": 6237, + "time_per_iteration": 2.6982784271240234 + }, + { + "auxiliary_loss_clip": 0.01126384, + "auxiliary_loss_mlp": 0.01084461, + "balance_loss_clip": 1.02538097, + "balance_loss_mlp": 1.00430417, + "epoch": 0.7500751518066494, + "flos": 34057320704640.0, + "grad_norm": 1.5457901903426972, + "language_loss": 0.73743248, + "learning_rate": 6.200640838214983e-07, + "loss": 0.75954092, + "num_input_tokens_seen": 134109185, + "step": 6238, + "time_per_iteration": 2.7868738174438477 + }, + { + "auxiliary_loss_clip": 0.01134453, + "auxiliary_loss_mlp": 0.01084271, + "balance_loss_clip": 1.02509761, + "balance_loss_mlp": 1.00416255, + "epoch": 0.7501953946972886, + "flos": 18843586289280.0, + "grad_norm": 1.9669781269879612, + "language_loss": 0.66281068, + "learning_rate": 6.195003377354578e-07, + "loss": 0.68499792, + "num_input_tokens_seen": 134128455, + "step": 6239, + "time_per_iteration": 2.677006244659424 + }, + { + "auxiliary_loss_clip": 0.01125896, + "auxiliary_loss_mlp": 0.01084528, + "balance_loss_clip": 1.024212, + "balance_loss_mlp": 1.00432456, + "epoch": 0.7503156375879276, + "flos": 20257675891200.0, + "grad_norm": 5.383453949249106, + "language_loss": 0.73271632, + "learning_rate": 6.189368010701183e-07, + "loss": 0.75482059, + "num_input_tokens_seen": 134145515, + "step": 6240, + "time_per_iteration": 2.678400993347168 + }, + { + "auxiliary_loss_clip": 0.01126778, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_clip": 1.02456522, + "balance_loss_mlp": 1.00353146, + "epoch": 0.7504358804785667, + "flos": 13480040574720.0, + "grad_norm": 1.874602706717411, + "language_loss": 0.76564574, + "learning_rate": 6.183734739109683e-07, + "loss": 0.78775036, + "num_input_tokens_seen": 134163335, + "step": 6241, + "time_per_iteration": 3.5316171646118164 + }, + { + "auxiliary_loss_clip": 0.01110312, + "auxiliary_loss_mlp": 0.0108368, + "balance_loss_clip": 1.02591276, + "balance_loss_mlp": 1.00342882, + "epoch": 0.7505561233692057, + "flos": 29461042431360.0, + "grad_norm": 1.94350490551067, + "language_loss": 0.68819535, + "learning_rate": 6.178103563434629e-07, + "loss": 0.71013528, + "num_input_tokens_seen": 134182335, + "step": 6242, + "time_per_iteration": 2.720755100250244 + }, + { + "auxiliary_loss_clip": 0.0113483, + "auxiliary_loss_mlp": 0.01084341, + "balance_loss_clip": 1.02512622, + "balance_loss_mlp": 1.00418472, + "epoch": 0.7506763662598449, + "flos": 20302457172480.0, + "grad_norm": 7.701500786499225, + "language_loss": 0.8396728, + "learning_rate": 6.172474484530283e-07, + "loss": 0.86186445, + "num_input_tokens_seen": 134201070, + "step": 6243, + "time_per_iteration": 3.629344940185547 + }, + { + "auxiliary_loss_clip": 0.01117804, + "auxiliary_loss_mlp": 0.01083601, + "balance_loss_clip": 1.02467287, + "balance_loss_mlp": 1.00349283, + "epoch": 0.750796609150484, + "flos": 37230961939200.0, + "grad_norm": 1.6746412564079995, + "language_loss": 0.76045573, + "learning_rate": 6.166847503250563e-07, + "loss": 0.78246975, + "num_input_tokens_seen": 134223310, + "step": 6244, + "time_per_iteration": 3.7641923427581787 + }, + { + "auxiliary_loss_clip": 0.01115472, + "auxiliary_loss_mlp": 0.01083172, + "balance_loss_clip": 1.02405143, + "balance_loss_mlp": 1.00311136, + "epoch": 0.750916852041123, + "flos": 19609417186560.0, + "grad_norm": 2.3964701829013935, + "language_loss": 0.78631157, + "learning_rate": 6.161222620449078e-07, + "loss": 0.80829799, + "num_input_tokens_seen": 134242085, + "step": 6245, + "time_per_iteration": 2.7575066089630127 + }, + { + "auxiliary_loss_clip": 0.01105958, + "auxiliary_loss_mlp": 0.01084311, + "balance_loss_clip": 1.02322292, + "balance_loss_mlp": 1.00415468, + "epoch": 0.7510370949317622, + "flos": 25112690807040.0, + "grad_norm": 2.219873931732747, + "language_loss": 0.79689598, + "learning_rate": 6.155599836979117e-07, + "loss": 0.81879866, + "num_input_tokens_seen": 134260770, + "step": 6246, + "time_per_iteration": 2.7739572525024414 + }, + { + "auxiliary_loss_clip": 0.01095862, + "auxiliary_loss_mlp": 0.01083388, + "balance_loss_clip": 1.02164054, + "balance_loss_mlp": 1.00323212, + "epoch": 0.7511573378224012, + "flos": 19062282245760.0, + "grad_norm": 2.137989346026319, + "language_loss": 0.81452644, + "learning_rate": 6.149979153693649e-07, + "loss": 0.83631891, + "num_input_tokens_seen": 134278025, + "step": 6247, + "time_per_iteration": 3.725111961364746 + }, + { + "auxiliary_loss_clip": 0.01128026, + "auxiliary_loss_mlp": 0.0108368, + "balance_loss_clip": 1.02637422, + "balance_loss_mlp": 1.00357175, + "epoch": 0.7512775807130403, + "flos": 19937676602880.0, + "grad_norm": 2.6605741502226428, + "language_loss": 0.77058387, + "learning_rate": 6.144360571445343e-07, + "loss": 0.79270095, + "num_input_tokens_seen": 134297170, + "step": 6248, + "time_per_iteration": 2.652806282043457 + }, + { + "auxiliary_loss_clip": 0.01124618, + "auxiliary_loss_mlp": 0.0108521, + "balance_loss_clip": 1.02410507, + "balance_loss_mlp": 1.00495815, + "epoch": 0.7513978236036795, + "flos": 20739920912640.0, + "grad_norm": 1.7453324402467905, + "language_loss": 0.80087787, + "learning_rate": 6.138744091086509e-07, + "loss": 0.82297617, + "num_input_tokens_seen": 134316755, + "step": 6249, + "time_per_iteration": 2.660233736038208 + }, + { + "auxiliary_loss_clip": 0.01106874, + "auxiliary_loss_mlp": 0.01084669, + "balance_loss_clip": 1.02407205, + "balance_loss_mlp": 1.0046562, + "epoch": 0.7515180664943185, + "flos": 27563163523200.0, + "grad_norm": 2.3144969719196826, + "language_loss": 0.72530073, + "learning_rate": 6.133129713469183e-07, + "loss": 0.74721611, + "num_input_tokens_seen": 134335960, + "step": 6250, + "time_per_iteration": 2.816673755645752 + }, + { + "auxiliary_loss_clip": 0.01109171, + "auxiliary_loss_mlp": 0.01084178, + "balance_loss_clip": 1.02385008, + "balance_loss_mlp": 1.004022, + "epoch": 0.7516383093849576, + "flos": 33803181002880.0, + "grad_norm": 2.1767348751560314, + "language_loss": 0.63541692, + "learning_rate": 6.127517439445053e-07, + "loss": 0.65735042, + "num_input_tokens_seen": 134356805, + "step": 6251, + "time_per_iteration": 2.8612639904022217 + }, + { + "auxiliary_loss_clip": 0.01082226, + "auxiliary_loss_mlp": 0.01084029, + "balance_loss_clip": 1.02444005, + "balance_loss_mlp": 1.00406289, + "epoch": 0.7517585522755967, + "flos": 29746172592000.0, + "grad_norm": 1.8677693539712175, + "language_loss": 0.81748056, + "learning_rate": 6.121907269865498e-07, + "loss": 0.8391431, + "num_input_tokens_seen": 134376295, + "step": 6252, + "time_per_iteration": 2.7866058349609375 + }, + { + "auxiliary_loss_clip": 0.01090017, + "auxiliary_loss_mlp": 0.01078608, + "balance_loss_clip": 1.01976275, + "balance_loss_mlp": 0.99964374, + "epoch": 0.7518787951662358, + "flos": 69807974319360.0, + "grad_norm": 0.9218326057874545, + "language_loss": 0.67238092, + "learning_rate": 6.116299205581577e-07, + "loss": 0.69406712, + "num_input_tokens_seen": 134431125, + "step": 6253, + "time_per_iteration": 3.3014910221099854 + }, + { + "auxiliary_loss_clip": 0.01137773, + "auxiliary_loss_mlp": 0.01084245, + "balance_loss_clip": 1.02761865, + "balance_loss_mlp": 1.00399303, + "epoch": 0.7519990380568748, + "flos": 34203225749760.0, + "grad_norm": 1.8286259963045521, + "language_loss": 0.68302262, + "learning_rate": 6.110693247444018e-07, + "loss": 0.70524275, + "num_input_tokens_seen": 134452960, + "step": 6254, + "time_per_iteration": 2.7469427585601807 + }, + { + "auxiliary_loss_clip": 0.01102236, + "auxiliary_loss_mlp": 0.01083968, + "balance_loss_clip": 1.0240798, + "balance_loss_mlp": 1.00400233, + "epoch": 0.752119280947514, + "flos": 21725704742400.0, + "grad_norm": 2.437030114623037, + "language_loss": 0.82436979, + "learning_rate": 6.105089396303258e-07, + "loss": 0.84623194, + "num_input_tokens_seen": 134471350, + "step": 6255, + "time_per_iteration": 2.765850782394409 + }, + { + "auxiliary_loss_clip": 0.01115828, + "auxiliary_loss_mlp": 0.01085124, + "balance_loss_clip": 1.02380991, + "balance_loss_mlp": 1.00487244, + "epoch": 0.7522395238381531, + "flos": 32742774668160.0, + "grad_norm": 1.8528317435689456, + "language_loss": 0.7549392, + "learning_rate": 6.099487653009383e-07, + "loss": 0.77694869, + "num_input_tokens_seen": 134490695, + "step": 6256, + "time_per_iteration": 2.748183250427246 + }, + { + "auxiliary_loss_clip": 0.0112686, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_clip": 1.02578282, + "balance_loss_mlp": 1.00403261, + "epoch": 0.7523597667287921, + "flos": 23476026579840.0, + "grad_norm": 1.8463376873510633, + "language_loss": 0.82974291, + "learning_rate": 6.093888018412192e-07, + "loss": 0.85185105, + "num_input_tokens_seen": 134506885, + "step": 6257, + "time_per_iteration": 2.687788486480713 + }, + { + "auxiliary_loss_clip": 0.01108239, + "auxiliary_loss_mlp": 0.01078892, + "balance_loss_clip": 1.02073932, + "balance_loss_mlp": 0.99992752, + "epoch": 0.7524800096194313, + "flos": 67346730501120.0, + "grad_norm": 0.7124078499550526, + "language_loss": 0.54730159, + "learning_rate": 6.088290493361125e-07, + "loss": 0.56917292, + "num_input_tokens_seen": 134571770, + "step": 6258, + "time_per_iteration": 3.359226942062378 + }, + { + "auxiliary_loss_clip": 0.01093577, + "auxiliary_loss_mlp": 0.01083199, + "balance_loss_clip": 1.02075076, + "balance_loss_mlp": 1.00309062, + "epoch": 0.7526002525100703, + "flos": 13006055681280.0, + "grad_norm": 2.899399734625945, + "language_loss": 0.71396017, + "learning_rate": 6.082695078705322e-07, + "loss": 0.73572791, + "num_input_tokens_seen": 134589250, + "step": 6259, + "time_per_iteration": 2.7928223609924316 + }, + { + "auxiliary_loss_clip": 0.01126349, + "auxiliary_loss_mlp": 0.01085182, + "balance_loss_clip": 1.02487195, + "balance_loss_mlp": 1.00488305, + "epoch": 0.7527204954007094, + "flos": 21397229844480.0, + "grad_norm": 2.629738216247581, + "language_loss": 0.68870515, + "learning_rate": 6.077101775293618e-07, + "loss": 0.71082044, + "num_input_tokens_seen": 134608075, + "step": 6260, + "time_per_iteration": 2.639035224914551 + }, + { + "auxiliary_loss_clip": 0.01125952, + "auxiliary_loss_mlp": 0.01084493, + "balance_loss_clip": 1.02549219, + "balance_loss_mlp": 1.00419366, + "epoch": 0.7528407382913486, + "flos": 18947188091520.0, + "grad_norm": 2.1918890150442554, + "language_loss": 0.82102466, + "learning_rate": 6.071510583974504e-07, + "loss": 0.84312916, + "num_input_tokens_seen": 134623260, + "step": 6261, + "time_per_iteration": 2.657106876373291 + }, + { + "auxiliary_loss_clip": 0.01134381, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_clip": 1.025172, + "balance_loss_mlp": 1.00358558, + "epoch": 0.7529609811819876, + "flos": 15231798956160.0, + "grad_norm": 1.9557856575831547, + "language_loss": 0.71947742, + "learning_rate": 6.065921505596161e-07, + "loss": 0.74165863, + "num_input_tokens_seen": 134641540, + "step": 6262, + "time_per_iteration": 2.589909076690674 + }, + { + "auxiliary_loss_clip": 0.01106479, + "auxiliary_loss_mlp": 0.01084152, + "balance_loss_clip": 1.0240773, + "balance_loss_mlp": 1.00394809, + "epoch": 0.7530812240726267, + "flos": 19354487385600.0, + "grad_norm": 1.6948337895121481, + "language_loss": 0.77074379, + "learning_rate": 6.060334541006445e-07, + "loss": 0.7926501, + "num_input_tokens_seen": 134660035, + "step": 6263, + "time_per_iteration": 2.8324081897735596 + }, + { + "auxiliary_loss_clip": 0.01108219, + "auxiliary_loss_mlp": 0.01084303, + "balance_loss_clip": 1.02337813, + "balance_loss_mlp": 1.00414634, + "epoch": 0.7532014669632658, + "flos": 27748247328000.0, + "grad_norm": 1.4718920155066344, + "language_loss": 0.68779385, + "learning_rate": 6.05474969105289e-07, + "loss": 0.70971906, + "num_input_tokens_seen": 134683025, + "step": 6264, + "time_per_iteration": 2.8398501873016357 + }, + { + "auxiliary_loss_clip": 0.01125486, + "auxiliary_loss_mlp": 0.01084181, + "balance_loss_clip": 1.02554727, + "balance_loss_mlp": 1.00392902, + "epoch": 0.7533217098539049, + "flos": 14137421333760.0, + "grad_norm": 1.8952557197484476, + "language_loss": 0.73772454, + "learning_rate": 6.049166956582725e-07, + "loss": 0.75982118, + "num_input_tokens_seen": 134701290, + "step": 6265, + "time_per_iteration": 2.6751441955566406 + }, + { + "auxiliary_loss_clip": 0.01127181, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_clip": 1.02648282, + "balance_loss_mlp": 1.00322545, + "epoch": 0.753441952744544, + "flos": 26429068437120.0, + "grad_norm": 2.0626161169366286, + "language_loss": 0.87177914, + "learning_rate": 6.043586338442841e-07, + "loss": 0.8938843, + "num_input_tokens_seen": 134720345, + "step": 6266, + "time_per_iteration": 3.5279364585876465 + }, + { + "auxiliary_loss_clip": 0.01135713, + "auxiliary_loss_mlp": 0.01083782, + "balance_loss_clip": 1.02680004, + "balance_loss_mlp": 1.00376892, + "epoch": 0.7535621956351831, + "flos": 23878621192320.0, + "grad_norm": 4.179068251216408, + "language_loss": 0.7288658, + "learning_rate": 6.038007837479815e-07, + "loss": 0.75106072, + "num_input_tokens_seen": 134741450, + "step": 6267, + "time_per_iteration": 2.653477668762207 + }, + { + "auxiliary_loss_clip": 0.0112422, + "auxiliary_loss_mlp": 0.01083326, + "balance_loss_clip": 1.0241909, + "balance_loss_mlp": 1.00331306, + "epoch": 0.7536824385258222, + "flos": 21795873960960.0, + "grad_norm": 1.8575906942518867, + "language_loss": 0.63880658, + "learning_rate": 6.032431454539897e-07, + "loss": 0.660882, + "num_input_tokens_seen": 134760295, + "step": 6268, + "time_per_iteration": 2.6578028202056885 + }, + { + "auxiliary_loss_clip": 0.01108597, + "auxiliary_loss_mlp": 0.01083193, + "balance_loss_clip": 1.02590191, + "balance_loss_mlp": 1.0030843, + "epoch": 0.7538026814164612, + "flos": 28911644933760.0, + "grad_norm": 1.8074697701029832, + "language_loss": 0.81259614, + "learning_rate": 6.026857190469014e-07, + "loss": 0.83451402, + "num_input_tokens_seen": 134782050, + "step": 6269, + "time_per_iteration": 3.825428009033203 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.0108463, + "balance_loss_clip": 1.02484846, + "balance_loss_mlp": 1.00447321, + "epoch": 0.7539229243071004, + "flos": 21104701482240.0, + "grad_norm": 2.3128075732241475, + "language_loss": 0.74234426, + "learning_rate": 6.0212850461128e-07, + "loss": 0.76435685, + "num_input_tokens_seen": 134801170, + "step": 6270, + "time_per_iteration": 3.6177895069122314 + }, + { + "auxiliary_loss_clip": 0.01114743, + "auxiliary_loss_mlp": 0.01082483, + "balance_loss_clip": 1.02328348, + "balance_loss_mlp": 1.00232649, + "epoch": 0.7540431671977395, + "flos": 15158469340800.0, + "grad_norm": 2.075818773990303, + "language_loss": 0.7445305, + "learning_rate": 6.015715022316516e-07, + "loss": 0.76650274, + "num_input_tokens_seen": 134819150, + "step": 6271, + "time_per_iteration": 2.6949329376220703 + }, + { + "auxiliary_loss_clip": 0.01094538, + "auxiliary_loss_mlp": 0.01083014, + "balance_loss_clip": 1.02351284, + "balance_loss_mlp": 1.00280976, + "epoch": 0.7541634100883785, + "flos": 18770579896320.0, + "grad_norm": 2.427165493900107, + "language_loss": 0.77959275, + "learning_rate": 6.010147119925154e-07, + "loss": 0.80136824, + "num_input_tokens_seen": 134836905, + "step": 6272, + "time_per_iteration": 2.758202314376831 + }, + { + "auxiliary_loss_clip": 0.01105882, + "auxiliary_loss_mlp": 0.01083995, + "balance_loss_clip": 1.02339804, + "balance_loss_mlp": 1.00383902, + "epoch": 0.7542836529790176, + "flos": 20594770053120.0, + "grad_norm": 1.985193426948077, + "language_loss": 0.6606887, + "learning_rate": 6.004581339783348e-07, + "loss": 0.68258744, + "num_input_tokens_seen": 134855225, + "step": 6273, + "time_per_iteration": 3.760223150253296 + }, + { + "auxiliary_loss_clip": 0.01125557, + "auxiliary_loss_mlp": 0.01084737, + "balance_loss_clip": 1.02470279, + "balance_loss_mlp": 1.00448537, + "epoch": 0.7544038958696567, + "flos": 19095104298240.0, + "grad_norm": 3.163829558112841, + "language_loss": 0.68776387, + "learning_rate": 5.999017682735425e-07, + "loss": 0.70986676, + "num_input_tokens_seen": 134871615, + "step": 6274, + "time_per_iteration": 2.660844087600708 + }, + { + "auxiliary_loss_clip": 0.01088425, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_clip": 1.02199781, + "balance_loss_mlp": 1.00422978, + "epoch": 0.7545241387602958, + "flos": 31723306859520.0, + "grad_norm": 1.7206589296503776, + "language_loss": 0.66147411, + "learning_rate": 5.993456149625387e-07, + "loss": 0.68320274, + "num_input_tokens_seen": 134892765, + "step": 6275, + "time_per_iteration": 2.9102530479431152 + }, + { + "auxiliary_loss_clip": 0.0110472, + "auxiliary_loss_mlp": 0.01084774, + "balance_loss_clip": 1.02199936, + "balance_loss_mlp": 1.00471306, + "epoch": 0.7546443816509348, + "flos": 20296495514880.0, + "grad_norm": 1.6508450944654205, + "language_loss": 0.82092512, + "learning_rate": 5.987896741296909e-07, + "loss": 0.84282011, + "num_input_tokens_seen": 134910505, + "step": 6276, + "time_per_iteration": 2.7725515365600586 + }, + { + "auxiliary_loss_clip": 0.01117196, + "auxiliary_loss_mlp": 0.01085803, + "balance_loss_clip": 1.02514708, + "balance_loss_mlp": 1.00564671, + "epoch": 0.754764624541574, + "flos": 23696159080320.0, + "grad_norm": 2.181700649053519, + "language_loss": 0.78509861, + "learning_rate": 5.982339458593361e-07, + "loss": 0.80712855, + "num_input_tokens_seen": 134930445, + "step": 6277, + "time_per_iteration": 2.6873221397399902 + }, + { + "auxiliary_loss_clip": 0.01126226, + "auxiliary_loss_mlp": 0.0087289, + "balance_loss_clip": 1.02575064, + "balance_loss_mlp": 1.0001123, + "epoch": 0.7548848674322131, + "flos": 25337204766720.0, + "grad_norm": 1.522184850333318, + "language_loss": 0.83902621, + "learning_rate": 5.976784302357767e-07, + "loss": 0.85901731, + "num_input_tokens_seen": 134951010, + "step": 6278, + "time_per_iteration": 2.6986799240112305 + }, + { + "auxiliary_loss_clip": 0.01127338, + "auxiliary_loss_mlp": 0.01085406, + "balance_loss_clip": 1.02619457, + "balance_loss_mlp": 1.00529766, + "epoch": 0.7550051103228521, + "flos": 19573147428480.0, + "grad_norm": 2.8677464269388335, + "language_loss": 0.73476708, + "learning_rate": 5.971231273432855e-07, + "loss": 0.75689447, + "num_input_tokens_seen": 134970495, + "step": 6279, + "time_per_iteration": 2.6657564640045166 + }, + { + "auxiliary_loss_clip": 0.0110746, + "auxiliary_loss_mlp": 0.01079121, + "balance_loss_clip": 1.02027559, + "balance_loss_mlp": 1.00015664, + "epoch": 0.7551253532134913, + "flos": 64150068648960.0, + "grad_norm": 0.8032846491289845, + "language_loss": 0.54606497, + "learning_rate": 5.965680372661e-07, + "loss": 0.5679307, + "num_input_tokens_seen": 135028060, + "step": 6280, + "time_per_iteration": 3.1444427967071533 + }, + { + "auxiliary_loss_clip": 0.01118047, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_clip": 1.02631783, + "balance_loss_mlp": 1.0033164, + "epoch": 0.7552455961041303, + "flos": 26067986968320.0, + "grad_norm": 2.4206540360030444, + "language_loss": 0.56420791, + "learning_rate": 5.960131600884266e-07, + "loss": 0.58622169, + "num_input_tokens_seen": 135047330, + "step": 6281, + "time_per_iteration": 2.7488858699798584 + }, + { + "auxiliary_loss_clip": 0.0110769, + "auxiliary_loss_mlp": 0.01083238, + "balance_loss_clip": 1.02376723, + "balance_loss_mlp": 1.00322509, + "epoch": 0.7553658389947694, + "flos": 24498223822080.0, + "grad_norm": 1.8790178798933301, + "language_loss": 0.7592411, + "learning_rate": 5.954584958944413e-07, + "loss": 0.7811504, + "num_input_tokens_seen": 135065995, + "step": 6282, + "time_per_iteration": 2.7278292179107666 + }, + { + "auxiliary_loss_clip": 0.01108584, + "auxiliary_loss_mlp": 0.00872879, + "balance_loss_clip": 1.02406359, + "balance_loss_mlp": 1.00013137, + "epoch": 0.7554860818854086, + "flos": 21799465320960.0, + "grad_norm": 2.229695079791432, + "language_loss": 0.81253028, + "learning_rate": 5.949040447682854e-07, + "loss": 0.83234501, + "num_input_tokens_seen": 135085820, + "step": 6283, + "time_per_iteration": 2.748443126678467 + }, + { + "auxiliary_loss_clip": 0.01117187, + "auxiliary_loss_mlp": 0.01085898, + "balance_loss_clip": 1.0250963, + "balance_loss_mlp": 1.00559855, + "epoch": 0.7556063247760476, + "flos": 16362123114240.0, + "grad_norm": 2.0929807729673855, + "language_loss": 0.68160331, + "learning_rate": 5.943498067940686e-07, + "loss": 0.7036342, + "num_input_tokens_seen": 135102845, + "step": 6284, + "time_per_iteration": 2.6506154537200928 + }, + { + "auxiliary_loss_clip": 0.01111017, + "auxiliary_loss_mlp": 0.01084907, + "balance_loss_clip": 1.02191854, + "balance_loss_mlp": 1.00470293, + "epoch": 0.7557265676666867, + "flos": 27235155502080.0, + "grad_norm": 1.8420255990886805, + "language_loss": 0.81766105, + "learning_rate": 5.937957820558686e-07, + "loss": 0.83962029, + "num_input_tokens_seen": 135122190, + "step": 6285, + "time_per_iteration": 2.7451460361480713 + }, + { + "auxiliary_loss_clip": 0.01083023, + "auxiliary_loss_mlp": 0.01078938, + "balance_loss_clip": 1.02065134, + "balance_loss_mlp": 0.99997377, + "epoch": 0.7558468105573258, + "flos": 62189131415040.0, + "grad_norm": 0.8466843717461537, + "language_loss": 0.65398425, + "learning_rate": 5.932419706377296e-07, + "loss": 0.67560387, + "num_input_tokens_seen": 135180495, + "step": 6286, + "time_per_iteration": 3.282148838043213 + }, + { + "auxiliary_loss_clip": 0.01103789, + "auxiliary_loss_mlp": 0.01083348, + "balance_loss_clip": 1.02272522, + "balance_loss_mlp": 1.00328708, + "epoch": 0.7559670534479649, + "flos": 33249078823680.0, + "grad_norm": 2.2250977832081063, + "language_loss": 0.74528849, + "learning_rate": 5.92688372623666e-07, + "loss": 0.76715994, + "num_input_tokens_seen": 135199200, + "step": 6287, + "time_per_iteration": 2.8325676918029785 + }, + { + "auxiliary_loss_clip": 0.01123783, + "auxiliary_loss_mlp": 0.0108381, + "balance_loss_clip": 1.02322817, + "balance_loss_mlp": 1.00360608, + "epoch": 0.7560872963386039, + "flos": 14064379027200.0, + "grad_norm": 2.180264098541242, + "language_loss": 0.74219358, + "learning_rate": 5.921349880976574e-07, + "loss": 0.76426953, + "num_input_tokens_seen": 135217035, + "step": 6288, + "time_per_iteration": 2.766247272491455 + }, + { + "auxiliary_loss_clip": 0.01117634, + "auxiliary_loss_mlp": 0.00872918, + "balance_loss_clip": 1.0242269, + "balance_loss_mlp": 1.0001123, + "epoch": 0.7562075392292431, + "flos": 20412307941120.0, + "grad_norm": 2.0401915489622366, + "language_loss": 0.81881064, + "learning_rate": 5.915818171436515e-07, + "loss": 0.83871615, + "num_input_tokens_seen": 135236370, + "step": 6289, + "time_per_iteration": 2.720522165298462 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.01083932, + "balance_loss_clip": 1.02442575, + "balance_loss_mlp": 1.00377595, + "epoch": 0.7563277821198822, + "flos": 20376792368640.0, + "grad_norm": 1.780893461405154, + "language_loss": 0.74864781, + "learning_rate": 5.910288598455642e-07, + "loss": 0.77066416, + "num_input_tokens_seen": 135255720, + "step": 6290, + "time_per_iteration": 2.729543924331665 + }, + { + "auxiliary_loss_clip": 0.01127128, + "auxiliary_loss_mlp": 0.01085615, + "balance_loss_clip": 1.02581108, + "balance_loss_mlp": 1.00531554, + "epoch": 0.7564480250105212, + "flos": 18588261438720.0, + "grad_norm": 1.9983400664207538, + "language_loss": 0.74892998, + "learning_rate": 5.90476116287278e-07, + "loss": 0.77105743, + "num_input_tokens_seen": 135273320, + "step": 6291, + "time_per_iteration": 2.65274715423584 + }, + { + "auxiliary_loss_clip": 0.01116464, + "auxiliary_loss_mlp": 0.01084934, + "balance_loss_clip": 1.02507436, + "balance_loss_mlp": 1.00487328, + "epoch": 0.7565682679011604, + "flos": 21215521918080.0, + "grad_norm": 1.6984560391402457, + "language_loss": 0.6759783, + "learning_rate": 5.899235865526456e-07, + "loss": 0.69799227, + "num_input_tokens_seen": 135292615, + "step": 6292, + "time_per_iteration": 3.53885817527771 + }, + { + "auxiliary_loss_clip": 0.01110177, + "auxiliary_loss_mlp": 0.01083194, + "balance_loss_clip": 1.0260396, + "balance_loss_mlp": 1.00318074, + "epoch": 0.7566885107917994, + "flos": 20449008662400.0, + "grad_norm": 1.8195452349298409, + "language_loss": 0.82167327, + "learning_rate": 5.893712707254825e-07, + "loss": 0.84360701, + "num_input_tokens_seen": 135310075, + "step": 6293, + "time_per_iteration": 2.747366189956665 + }, + { + "auxiliary_loss_clip": 0.01092151, + "auxiliary_loss_mlp": 0.0108519, + "balance_loss_clip": 1.02210009, + "balance_loss_mlp": 1.0049386, + "epoch": 0.7568087536824385, + "flos": 19025832919680.0, + "grad_norm": 2.4038796838183534, + "language_loss": 0.65707296, + "learning_rate": 5.888191688895769e-07, + "loss": 0.67884636, + "num_input_tokens_seen": 135327335, + "step": 6294, + "time_per_iteration": 3.7378616333007812 + }, + { + "auxiliary_loss_clip": 0.01135397, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_clip": 1.02593291, + "balance_loss_mlp": 1.00368142, + "epoch": 0.7569289965730777, + "flos": 15225442248960.0, + "grad_norm": 1.91851093473258, + "language_loss": 0.62039089, + "learning_rate": 5.882672811286813e-07, + "loss": 0.64258468, + "num_input_tokens_seen": 135343615, + "step": 6295, + "time_per_iteration": 3.4064722061157227 + }, + { + "auxiliary_loss_clip": 0.01135155, + "auxiliary_loss_mlp": 0.01083998, + "balance_loss_clip": 1.02591932, + "balance_loss_mlp": 1.00374627, + "epoch": 0.7570492394637167, + "flos": 20769367086720.0, + "grad_norm": 2.351942768025112, + "language_loss": 0.69201636, + "learning_rate": 5.877156075265166e-07, + "loss": 0.71420795, + "num_input_tokens_seen": 135359880, + "step": 6296, + "time_per_iteration": 2.6240410804748535 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01085075, + "balance_loss_clip": 1.02537286, + "balance_loss_mlp": 1.00487065, + "epoch": 0.7571694823543558, + "flos": 15664091137920.0, + "grad_norm": 2.5339429303615995, + "language_loss": 0.69788414, + "learning_rate": 5.871641481667715e-07, + "loss": 0.71991968, + "num_input_tokens_seen": 135374325, + "step": 6297, + "time_per_iteration": 2.6562256813049316 + }, + { + "auxiliary_loss_clip": 0.01096332, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_clip": 1.02210188, + "balance_loss_mlp": 1.00401783, + "epoch": 0.7572897252449949, + "flos": 25409241492480.0, + "grad_norm": 1.7050321775674304, + "language_loss": 0.84366536, + "learning_rate": 5.866129031331011e-07, + "loss": 0.86547095, + "num_input_tokens_seen": 135393980, + "step": 6298, + "time_per_iteration": 3.757497787475586 + }, + { + "auxiliary_loss_clip": 0.01115273, + "auxiliary_loss_mlp": 0.0108362, + "balance_loss_clip": 1.02375603, + "balance_loss_mlp": 1.00336862, + "epoch": 0.757409968135634, + "flos": 24279348297600.0, + "grad_norm": 1.9827178494275133, + "language_loss": 0.83060241, + "learning_rate": 5.8606187250913e-07, + "loss": 0.85259128, + "num_input_tokens_seen": 135412030, + "step": 6299, + "time_per_iteration": 2.6948373317718506 + }, + { + "auxiliary_loss_clip": 0.01126955, + "auxiliary_loss_mlp": 0.00872899, + "balance_loss_clip": 1.02655828, + "balance_loss_mlp": 1.00011182, + "epoch": 0.757530211026273, + "flos": 24133766474880.0, + "grad_norm": 1.7843254641895012, + "language_loss": 0.83937132, + "learning_rate": 5.855110563784482e-07, + "loss": 0.85936981, + "num_input_tokens_seen": 135430565, + "step": 6300, + "time_per_iteration": 2.693058967590332 + }, + { + "auxiliary_loss_clip": 0.01126176, + "auxiliary_loss_mlp": 0.00872927, + "balance_loss_clip": 1.02463388, + "balance_loss_mlp": 1.00009573, + "epoch": 0.7576504539169122, + "flos": 23951807153280.0, + "grad_norm": 2.218107113459043, + "language_loss": 0.6391511, + "learning_rate": 5.849604548246156e-07, + "loss": 0.65914214, + "num_input_tokens_seen": 135451675, + "step": 6301, + "time_per_iteration": 2.67604398727417 + }, + { + "auxiliary_loss_clip": 0.01117201, + "auxiliary_loss_mlp": 0.00872894, + "balance_loss_clip": 1.0250361, + "balance_loss_mlp": 1.00009739, + "epoch": 0.7577706968075513, + "flos": 21251360712960.0, + "grad_norm": 2.010617008185089, + "language_loss": 0.80298138, + "learning_rate": 5.844100679311565e-07, + "loss": 0.82288229, + "num_input_tokens_seen": 135470635, + "step": 6302, + "time_per_iteration": 2.7622315883636475 + }, + { + "auxiliary_loss_clip": 0.01113655, + "auxiliary_loss_mlp": 0.01083958, + "balance_loss_clip": 1.02359128, + "balance_loss_mlp": 1.0037539, + "epoch": 0.7578909396981903, + "flos": 18296595002880.0, + "grad_norm": 1.988380874552955, + "language_loss": 0.76190603, + "learning_rate": 5.838598957815637e-07, + "loss": 0.78388214, + "num_input_tokens_seen": 135487865, + "step": 6303, + "time_per_iteration": 2.6975317001342773 + }, + { + "auxiliary_loss_clip": 0.0110883, + "auxiliary_loss_mlp": 0.01084119, + "balance_loss_clip": 1.02377164, + "balance_loss_mlp": 1.00396264, + "epoch": 0.7580111825888295, + "flos": 25373869574400.0, + "grad_norm": 1.4181779165756494, + "language_loss": 0.85168451, + "learning_rate": 5.833099384592996e-07, + "loss": 0.87361395, + "num_input_tokens_seen": 135508440, + "step": 6304, + "time_per_iteration": 2.7794344425201416 + }, + { + "auxiliary_loss_clip": 0.01112584, + "auxiliary_loss_mlp": 0.01084186, + "balance_loss_clip": 1.02241504, + "balance_loss_mlp": 1.00402927, + "epoch": 0.7581314254794685, + "flos": 23768662682880.0, + "grad_norm": 2.121791777084704, + "language_loss": 0.71458042, + "learning_rate": 5.827601960477913e-07, + "loss": 0.73654807, + "num_input_tokens_seen": 135526365, + "step": 6305, + "time_per_iteration": 2.7396082878112793 + }, + { + "auxiliary_loss_clip": 0.01125489, + "auxiliary_loss_mlp": 0.01084052, + "balance_loss_clip": 1.02444434, + "balance_loss_mlp": 1.00389576, + "epoch": 0.7582516683701076, + "flos": 22054610603520.0, + "grad_norm": 1.6948089796616548, + "language_loss": 0.7060138, + "learning_rate": 5.822106686304344e-07, + "loss": 0.72810912, + "num_input_tokens_seen": 135545655, + "step": 6306, + "time_per_iteration": 2.6346092224121094 + }, + { + "auxiliary_loss_clip": 0.01105671, + "auxiliary_loss_mlp": 0.01084534, + "balance_loss_clip": 1.02231574, + "balance_loss_mlp": 1.00437784, + "epoch": 0.7583719112607467, + "flos": 31649725848960.0, + "grad_norm": 1.8578115887441111, + "language_loss": 0.57843614, + "learning_rate": 5.816613562905919e-07, + "loss": 0.60033822, + "num_input_tokens_seen": 135566840, + "step": 6307, + "time_per_iteration": 2.8753936290740967 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.01083899, + "balance_loss_clip": 1.02016139, + "balance_loss_mlp": 1.00388598, + "epoch": 0.7584921541513858, + "flos": 33068376478080.0, + "grad_norm": 3.2347155383651742, + "language_loss": 0.69836974, + "learning_rate": 5.811122591115933e-07, + "loss": 0.72021383, + "num_input_tokens_seen": 135587825, + "step": 6308, + "time_per_iteration": 2.8089489936828613 + }, + { + "auxiliary_loss_clip": 0.01101298, + "auxiliary_loss_mlp": 0.01085166, + "balance_loss_clip": 1.02118409, + "balance_loss_mlp": 1.00496244, + "epoch": 0.7586123970420249, + "flos": 23326350606720.0, + "grad_norm": 2.678430197677922, + "language_loss": 0.71110713, + "learning_rate": 5.805633771767376e-07, + "loss": 0.73297179, + "num_input_tokens_seen": 135605220, + "step": 6309, + "time_per_iteration": 2.861872434616089 + }, + { + "auxiliary_loss_clip": 0.01104602, + "auxiliary_loss_mlp": 0.01083841, + "balance_loss_clip": 1.02793932, + "balance_loss_mlp": 1.00373268, + "epoch": 0.7587326399326639, + "flos": 18334229477760.0, + "grad_norm": 1.746441504114744, + "language_loss": 0.77647763, + "learning_rate": 5.800147105692888e-07, + "loss": 0.79836208, + "num_input_tokens_seen": 135624795, + "step": 6310, + "time_per_iteration": 2.670257568359375 + }, + { + "auxiliary_loss_clip": 0.01126518, + "auxiliary_loss_mlp": 0.010842, + "balance_loss_clip": 1.02517343, + "balance_loss_mlp": 1.00409126, + "epoch": 0.7588528828233031, + "flos": 17275080119040.0, + "grad_norm": 2.0099027291444047, + "language_loss": 0.79287517, + "learning_rate": 5.794662593724795e-07, + "loss": 0.8149823, + "num_input_tokens_seen": 135643800, + "step": 6311, + "time_per_iteration": 2.65484619140625 + }, + { + "auxiliary_loss_clip": 0.01136839, + "auxiliary_loss_mlp": 0.01085355, + "balance_loss_clip": 1.02801764, + "balance_loss_mlp": 1.00515151, + "epoch": 0.7589731257139422, + "flos": 17713621267200.0, + "grad_norm": 2.3516931659766995, + "language_loss": 0.75032187, + "learning_rate": 5.789180236695091e-07, + "loss": 0.77254373, + "num_input_tokens_seen": 135660655, + "step": 6312, + "time_per_iteration": 2.547337055206299 + }, + { + "auxiliary_loss_clip": 0.01124548, + "auxiliary_loss_mlp": 0.01083358, + "balance_loss_clip": 1.02501893, + "balance_loss_mlp": 1.00339222, + "epoch": 0.7590933686045812, + "flos": 15961072786560.0, + "grad_norm": 1.8489062107014498, + "language_loss": 0.84969866, + "learning_rate": 5.78370003543544e-07, + "loss": 0.87177777, + "num_input_tokens_seen": 135679410, + "step": 6313, + "time_per_iteration": 2.6756715774536133 + }, + { + "auxiliary_loss_clip": 0.01125745, + "auxiliary_loss_mlp": 0.00872917, + "balance_loss_clip": 1.02558792, + "balance_loss_mlp": 1.00014651, + "epoch": 0.7592136114952204, + "flos": 21068072588160.0, + "grad_norm": 1.8212996543942688, + "language_loss": 0.83408487, + "learning_rate": 5.778221990777203e-07, + "loss": 0.8540715, + "num_input_tokens_seen": 135697150, + "step": 6314, + "time_per_iteration": 2.64333176612854 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01084282, + "balance_loss_clip": 1.0246737, + "balance_loss_mlp": 1.00412607, + "epoch": 0.7593338543858594, + "flos": 25297666871040.0, + "grad_norm": 2.1067262920500887, + "language_loss": 0.82815969, + "learning_rate": 5.772746103551372e-07, + "loss": 0.85015702, + "num_input_tokens_seen": 135712545, + "step": 6315, + "time_per_iteration": 2.7107813358306885 + }, + { + "auxiliary_loss_clip": 0.01116593, + "auxiliary_loss_mlp": 0.01084032, + "balance_loss_clip": 1.02505541, + "balance_loss_mlp": 1.00387573, + "epoch": 0.7594540972764985, + "flos": 31832367528960.0, + "grad_norm": 2.2298937648331667, + "language_loss": 0.71677899, + "learning_rate": 5.767272374588648e-07, + "loss": 0.73878527, + "num_input_tokens_seen": 135733950, + "step": 6316, + "time_per_iteration": 2.8104031085968018 + }, + { + "auxiliary_loss_clip": 0.01124905, + "auxiliary_loss_mlp": 0.01084181, + "balance_loss_clip": 1.02506423, + "balance_loss_mlp": 1.00402498, + "epoch": 0.7595743401671377, + "flos": 37597250880000.0, + "grad_norm": 1.7439986866523653, + "language_loss": 0.77863359, + "learning_rate": 5.76180080471939e-07, + "loss": 0.80072439, + "num_input_tokens_seen": 135757120, + "step": 6317, + "time_per_iteration": 3.6936981678009033 + }, + { + "auxiliary_loss_clip": 0.0113677, + "auxiliary_loss_mlp": 0.01085331, + "balance_loss_clip": 1.0267812, + "balance_loss_mlp": 1.00498402, + "epoch": 0.7596945830577767, + "flos": 18287724343680.0, + "grad_norm": 2.009545333219528, + "language_loss": 0.72409689, + "learning_rate": 5.756331394773631e-07, + "loss": 0.74631792, + "num_input_tokens_seen": 135773335, + "step": 6318, + "time_per_iteration": 2.6683521270751953 + }, + { + "auxiliary_loss_clip": 0.01081255, + "auxiliary_loss_mlp": 0.0087297, + "balance_loss_clip": 1.01759362, + "balance_loss_mlp": 1.00005937, + "epoch": 0.7598148259484158, + "flos": 22233122219520.0, + "grad_norm": 1.5826111838301193, + "language_loss": 0.75777441, + "learning_rate": 5.750864145581071e-07, + "loss": 0.77731669, + "num_input_tokens_seen": 135792555, + "step": 6319, + "time_per_iteration": 3.9772160053253174 + }, + { + "auxiliary_loss_clip": 0.01135724, + "auxiliary_loss_mlp": 0.01083493, + "balance_loss_clip": 1.02639472, + "balance_loss_mlp": 1.00347948, + "epoch": 0.7599350688390549, + "flos": 27161718145920.0, + "grad_norm": 3.0805036577731593, + "language_loss": 0.86148286, + "learning_rate": 5.745399057971085e-07, + "loss": 0.8836751, + "num_input_tokens_seen": 135813690, + "step": 6320, + "time_per_iteration": 3.6622331142425537 + }, + { + "auxiliary_loss_clip": 0.01128415, + "auxiliary_loss_mlp": 0.01084651, + "balance_loss_clip": 1.02686214, + "balance_loss_mlp": 1.00439954, + "epoch": 0.760055311729694, + "flos": 15560704817280.0, + "grad_norm": 2.197786279080456, + "language_loss": 0.75844747, + "learning_rate": 5.739936132772738e-07, + "loss": 0.78057814, + "num_input_tokens_seen": 135832255, + "step": 6321, + "time_per_iteration": 2.6884257793426514 + }, + { + "auxiliary_loss_clip": 0.01134701, + "auxiliary_loss_mlp": 0.01084094, + "balance_loss_clip": 1.02582002, + "balance_loss_mlp": 1.0039382, + "epoch": 0.760175554620333, + "flos": 25155496840320.0, + "grad_norm": 1.9456791281272308, + "language_loss": 0.74298441, + "learning_rate": 5.734475370814733e-07, + "loss": 0.76517236, + "num_input_tokens_seen": 135851935, + "step": 6322, + "time_per_iteration": 2.6324446201324463 + }, + { + "auxiliary_loss_clip": 0.01125689, + "auxiliary_loss_mlp": 0.01084126, + "balance_loss_clip": 1.02464044, + "balance_loss_mlp": 1.00396967, + "epoch": 0.7602957975109722, + "flos": 24353791234560.0, + "grad_norm": 1.6473815321147807, + "language_loss": 0.78798735, + "learning_rate": 5.729016772925483e-07, + "loss": 0.81008554, + "num_input_tokens_seen": 135873510, + "step": 6323, + "time_per_iteration": 2.7572755813598633 + }, + { + "auxiliary_loss_clip": 0.01091557, + "auxiliary_loss_mlp": 0.0108363, + "balance_loss_clip": 1.01971495, + "balance_loss_mlp": 1.00347352, + "epoch": 0.7604160404016113, + "flos": 25192664438400.0, + "grad_norm": 1.7605067671270058, + "language_loss": 0.70460558, + "learning_rate": 5.723560339933038e-07, + "loss": 0.72635746, + "num_input_tokens_seen": 135893845, + "step": 6324, + "time_per_iteration": 3.7396514415740967 + }, + { + "auxiliary_loss_clip": 0.01128065, + "auxiliary_loss_mlp": 0.00872831, + "balance_loss_clip": 1.02623582, + "balance_loss_mlp": 1.00007617, + "epoch": 0.7605362832922503, + "flos": 29861841363840.0, + "grad_norm": 2.526878998276759, + "language_loss": 0.65177733, + "learning_rate": 5.71810607266513e-07, + "loss": 0.67178631, + "num_input_tokens_seen": 135912430, + "step": 6325, + "time_per_iteration": 2.6923322677612305 + }, + { + "auxiliary_loss_clip": 0.01126219, + "auxiliary_loss_mlp": 0.0108501, + "balance_loss_clip": 1.02527952, + "balance_loss_mlp": 1.00485349, + "epoch": 0.7606565261828895, + "flos": 13917935278080.0, + "grad_norm": 1.6415352976320197, + "language_loss": 0.60703957, + "learning_rate": 5.712653971949184e-07, + "loss": 0.62915182, + "num_input_tokens_seen": 135930550, + "step": 6326, + "time_per_iteration": 2.685624361038208 + }, + { + "auxiliary_loss_clip": 0.01128137, + "auxiliary_loss_mlp": 0.01084507, + "balance_loss_clip": 1.02645254, + "balance_loss_mlp": 1.00439787, + "epoch": 0.7607767690735285, + "flos": 18551273408640.0, + "grad_norm": 2.913933963316248, + "language_loss": 0.76134634, + "learning_rate": 5.707204038612268e-07, + "loss": 0.78347284, + "num_input_tokens_seen": 135947980, + "step": 6327, + "time_per_iteration": 2.6025002002716064 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01086263, + "balance_loss_clip": 1.02526617, + "balance_loss_mlp": 1.00596368, + "epoch": 0.7608970119641676, + "flos": 20922993555840.0, + "grad_norm": 2.0855207294136484, + "language_loss": 0.74080044, + "learning_rate": 5.701756273481138e-07, + "loss": 0.76283264, + "num_input_tokens_seen": 135965400, + "step": 6328, + "time_per_iteration": 2.695769786834717 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01084475, + "balance_loss_clip": 1.02572536, + "balance_loss_mlp": 1.00441408, + "epoch": 0.7610172548548068, + "flos": 23807302738560.0, + "grad_norm": 1.427828611046014, + "language_loss": 0.73944181, + "learning_rate": 5.696310677382212e-07, + "loss": 0.7613132, + "num_input_tokens_seen": 135986795, + "step": 6329, + "time_per_iteration": 2.70743465423584 + }, + { + "auxiliary_loss_clip": 0.01087191, + "auxiliary_loss_mlp": 0.01079629, + "balance_loss_clip": 1.02522469, + "balance_loss_mlp": 1.00066471, + "epoch": 0.7611374977454458, + "flos": 66496580426880.0, + "grad_norm": 0.9674106396721702, + "language_loss": 0.61836219, + "learning_rate": 5.690867251141576e-07, + "loss": 0.64003038, + "num_input_tokens_seen": 136053450, + "step": 6330, + "time_per_iteration": 3.4547595977783203 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_clip": 1.02581525, + "balance_loss_mlp": 1.00388396, + "epoch": 0.7612577406360849, + "flos": 15633136592640.0, + "grad_norm": 4.469956205976149, + "language_loss": 0.92162853, + "learning_rate": 5.685425995585013e-07, + "loss": 0.94358063, + "num_input_tokens_seen": 136071375, + "step": 6331, + "time_per_iteration": 2.7156176567077637 + }, + { + "auxiliary_loss_clip": 0.01100177, + "auxiliary_loss_mlp": 0.01079267, + "balance_loss_clip": 1.02075887, + "balance_loss_mlp": 1.00030255, + "epoch": 0.761377983526724, + "flos": 60526253237760.0, + "grad_norm": 0.754743946963431, + "language_loss": 0.59078664, + "learning_rate": 5.679986911537935e-07, + "loss": 0.61258107, + "num_input_tokens_seen": 136138905, + "step": 6332, + "time_per_iteration": 3.3981401920318604 + }, + { + "auxiliary_loss_clip": 0.01084361, + "auxiliary_loss_mlp": 0.01085295, + "balance_loss_clip": 1.01830816, + "balance_loss_mlp": 1.00509071, + "epoch": 0.7614982264173631, + "flos": 35772522019200.0, + "grad_norm": 1.870990945135462, + "language_loss": 0.67020965, + "learning_rate": 5.674549999825462e-07, + "loss": 0.69190627, + "num_input_tokens_seen": 136161720, + "step": 6333, + "time_per_iteration": 2.9202675819396973 + }, + { + "auxiliary_loss_clip": 0.01107734, + "auxiliary_loss_mlp": 0.01079331, + "balance_loss_clip": 1.02042365, + "balance_loss_mlp": 1.00036657, + "epoch": 0.7616184693080021, + "flos": 67925502345600.0, + "grad_norm": 0.9129746169015873, + "language_loss": 0.71480983, + "learning_rate": 5.669115261272363e-07, + "loss": 0.73668051, + "num_input_tokens_seen": 136222040, + "step": 6334, + "time_per_iteration": 3.275831460952759 + }, + { + "auxiliary_loss_clip": 0.01126, + "auxiliary_loss_mlp": 0.01084132, + "balance_loss_clip": 1.02537441, + "balance_loss_mlp": 1.00392795, + "epoch": 0.7617387121986413, + "flos": 20521979141760.0, + "grad_norm": 2.217690061515714, + "language_loss": 0.7307775, + "learning_rate": 5.663682696703081e-07, + "loss": 0.75287879, + "num_input_tokens_seen": 136240305, + "step": 6335, + "time_per_iteration": 2.646862268447876 + }, + { + "auxiliary_loss_clip": 0.01135595, + "auxiliary_loss_mlp": 0.01083961, + "balance_loss_clip": 1.02619362, + "balance_loss_mlp": 1.00380445, + "epoch": 0.7618589550892804, + "flos": 18624495283200.0, + "grad_norm": 1.97676966964292, + "language_loss": 0.8226428, + "learning_rate": 5.658252306941746e-07, + "loss": 0.84483832, + "num_input_tokens_seen": 136259625, + "step": 6336, + "time_per_iteration": 2.6200883388519287 + }, + { + "auxiliary_loss_clip": 0.01095377, + "auxiliary_loss_mlp": 0.01084539, + "balance_loss_clip": 1.02215183, + "balance_loss_mlp": 1.00423932, + "epoch": 0.7619791979799194, + "flos": 17453735389440.0, + "grad_norm": 2.3751306806052184, + "language_loss": 0.75522554, + "learning_rate": 5.65282409281212e-07, + "loss": 0.77702469, + "num_input_tokens_seen": 136277090, + "step": 6337, + "time_per_iteration": 2.7430851459503174 + }, + { + "auxiliary_loss_clip": 0.01116232, + "auxiliary_loss_mlp": 0.01084114, + "balance_loss_clip": 1.02407193, + "balance_loss_mlp": 1.00391042, + "epoch": 0.7620994408705585, + "flos": 14137421333760.0, + "grad_norm": 2.1392881676518045, + "language_loss": 0.70138848, + "learning_rate": 5.64739805513768e-07, + "loss": 0.72339195, + "num_input_tokens_seen": 136294635, + "step": 6338, + "time_per_iteration": 2.6984171867370605 + }, + { + "auxiliary_loss_clip": 0.01106385, + "auxiliary_loss_mlp": 0.00872913, + "balance_loss_clip": 1.01958585, + "balance_loss_mlp": 1.00123775, + "epoch": 0.7622196837611976, + "flos": 70708792527360.0, + "grad_norm": 0.7868193696129135, + "language_loss": 0.55674338, + "learning_rate": 5.641974194741541e-07, + "loss": 0.57653642, + "num_input_tokens_seen": 136350320, + "step": 6339, + "time_per_iteration": 3.167128324508667 + }, + { + "auxiliary_loss_clip": 0.01085843, + "auxiliary_loss_mlp": 0.01079938, + "balance_loss_clip": 1.0166353, + "balance_loss_mlp": 1.00097358, + "epoch": 0.7623399266518367, + "flos": 60684150447360.0, + "grad_norm": 0.7702823583992706, + "language_loss": 0.63729119, + "learning_rate": 5.636552512446502e-07, + "loss": 0.6589489, + "num_input_tokens_seen": 136411375, + "step": 6340, + "time_per_iteration": 3.188917636871338 + }, + { + "auxiliary_loss_clip": 0.01118753, + "auxiliary_loss_mlp": 0.01084429, + "balance_loss_clip": 1.02465892, + "balance_loss_mlp": 1.00427246, + "epoch": 0.7624601695424758, + "flos": 26468893641600.0, + "grad_norm": 1.6707034820065856, + "language_loss": 0.77811837, + "learning_rate": 5.631133009075027e-07, + "loss": 0.80015016, + "num_input_tokens_seen": 136430560, + "step": 6341, + "time_per_iteration": 2.725698709487915 + }, + { + "auxiliary_loss_clip": 0.0112577, + "auxiliary_loss_mlp": 0.00872865, + "balance_loss_clip": 1.02490091, + "balance_loss_mlp": 1.00008667, + "epoch": 0.7625804124331149, + "flos": 19135755515520.0, + "grad_norm": 1.922147429296596, + "language_loss": 0.68460476, + "learning_rate": 5.625715685449242e-07, + "loss": 0.70459116, + "num_input_tokens_seen": 136448665, + "step": 6342, + "time_per_iteration": 2.6379923820495605 + }, + { + "auxiliary_loss_clip": 0.01084523, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_clip": 1.02173567, + "balance_loss_mlp": 1.00378835, + "epoch": 0.762700655323754, + "flos": 26213101914240.0, + "grad_norm": 1.541161095737405, + "language_loss": 0.71541566, + "learning_rate": 5.620300542390966e-07, + "loss": 0.73709983, + "num_input_tokens_seen": 136469710, + "step": 6343, + "time_per_iteration": 3.6160969734191895 + }, + { + "auxiliary_loss_clip": 0.01101617, + "auxiliary_loss_mlp": 0.01085063, + "balance_loss_clip": 1.02526653, + "balance_loss_mlp": 1.00490713, + "epoch": 0.762820898214393, + "flos": 22382582711040.0, + "grad_norm": 1.6375406184221073, + "language_loss": 0.85087854, + "learning_rate": 5.614887580721659e-07, + "loss": 0.87274534, + "num_input_tokens_seen": 136489855, + "step": 6344, + "time_per_iteration": 3.792903184890747 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01084644, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.00444007, + "epoch": 0.7629411411050322, + "flos": 15700504550400.0, + "grad_norm": 2.2174274563753156, + "language_loss": 0.7347725, + "learning_rate": 5.609476801262481e-07, + "loss": 0.75660825, + "num_input_tokens_seen": 136504715, + "step": 6345, + "time_per_iteration": 2.693636417388916 + }, + { + "auxiliary_loss_clip": 0.01103138, + "auxiliary_loss_mlp": 0.01084264, + "balance_loss_clip": 1.02175665, + "balance_loss_mlp": 1.00410795, + "epoch": 0.7630613839956712, + "flos": 13770342293760.0, + "grad_norm": 3.616445844695063, + "language_loss": 0.6400609, + "learning_rate": 5.604068204834223e-07, + "loss": 0.66193491, + "num_input_tokens_seen": 136521610, + "step": 6346, + "time_per_iteration": 3.767080545425415 + }, + { + "auxiliary_loss_clip": 0.01090523, + "auxiliary_loss_mlp": 0.00872959, + "balance_loss_clip": 1.0189842, + "balance_loss_mlp": 1.00006676, + "epoch": 0.7631816268863103, + "flos": 14569569861120.0, + "grad_norm": 2.036840732816648, + "language_loss": 0.76889658, + "learning_rate": 5.598661792257367e-07, + "loss": 0.78853142, + "num_input_tokens_seen": 136538655, + "step": 6347, + "time_per_iteration": 2.823991060256958 + }, + { + "auxiliary_loss_clip": 0.0112709, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_clip": 1.02564704, + "balance_loss_mlp": 1.00353932, + "epoch": 0.7633018697769495, + "flos": 19062210418560.0, + "grad_norm": 1.7651433325052774, + "language_loss": 0.75751358, + "learning_rate": 5.593257564352071e-07, + "loss": 0.77962095, + "num_input_tokens_seen": 136557095, + "step": 6348, + "time_per_iteration": 2.7517101764678955 + }, + { + "auxiliary_loss_clip": 0.01125305, + "auxiliary_loss_mlp": 0.01084572, + "balance_loss_clip": 1.02503538, + "balance_loss_mlp": 1.00451136, + "epoch": 0.7634221126675885, + "flos": 22052958577920.0, + "grad_norm": 1.6859609797527035, + "language_loss": 0.75555158, + "learning_rate": 5.58785552193815e-07, + "loss": 0.77765036, + "num_input_tokens_seen": 136577340, + "step": 6349, + "time_per_iteration": 3.4952094554901123 + }, + { + "auxiliary_loss_clip": 0.01134953, + "auxiliary_loss_mlp": 0.01084433, + "balance_loss_clip": 1.02569687, + "balance_loss_mlp": 1.00432491, + "epoch": 0.7635423555582276, + "flos": 29382720825600.0, + "grad_norm": 2.0867007821165573, + "language_loss": 0.75605828, + "learning_rate": 5.582455665835086e-07, + "loss": 0.77825212, + "num_input_tokens_seen": 136597635, + "step": 6350, + "time_per_iteration": 2.6885313987731934 + }, + { + "auxiliary_loss_clip": 0.01118713, + "auxiliary_loss_mlp": 0.01084122, + "balance_loss_clip": 1.02499366, + "balance_loss_mlp": 1.00387001, + "epoch": 0.7636625984488667, + "flos": 17784903807360.0, + "grad_norm": 2.3288867982857147, + "language_loss": 0.72131258, + "learning_rate": 5.577057996862036e-07, + "loss": 0.74334091, + "num_input_tokens_seen": 136615260, + "step": 6351, + "time_per_iteration": 2.6751272678375244 + }, + { + "auxiliary_loss_clip": 0.01135312, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_clip": 1.02643764, + "balance_loss_mlp": 1.00403547, + "epoch": 0.7637828413395058, + "flos": 23734583654400.0, + "grad_norm": 1.5564936476243882, + "language_loss": 0.76082069, + "learning_rate": 5.571662515837814e-07, + "loss": 0.78301525, + "num_input_tokens_seen": 136637220, + "step": 6352, + "time_per_iteration": 2.6422083377838135 + }, + { + "auxiliary_loss_clip": 0.01116218, + "auxiliary_loss_mlp": 0.01082876, + "balance_loss_clip": 1.02474642, + "balance_loss_mlp": 1.00281477, + "epoch": 0.7639030842301449, + "flos": 36283279461120.0, + "grad_norm": 1.7751582295288855, + "language_loss": 0.83484817, + "learning_rate": 5.566269223580926e-07, + "loss": 0.85683906, + "num_input_tokens_seen": 136658930, + "step": 6353, + "time_per_iteration": 2.84272837638855 + }, + { + "auxiliary_loss_clip": 0.01128016, + "auxiliary_loss_mlp": 0.01084285, + "balance_loss_clip": 1.02648616, + "balance_loss_mlp": 1.0042243, + "epoch": 0.764023327120784, + "flos": 28878104609280.0, + "grad_norm": 2.3392317841805603, + "language_loss": 0.75146073, + "learning_rate": 5.560878120909511e-07, + "loss": 0.77358377, + "num_input_tokens_seen": 136681530, + "step": 6354, + "time_per_iteration": 2.686898946762085 + }, + { + "auxiliary_loss_clip": 0.01107761, + "auxiliary_loss_mlp": 0.01078991, + "balance_loss_clip": 1.02028751, + "balance_loss_mlp": 1.00002706, + "epoch": 0.7641435700114231, + "flos": 64789711067520.0, + "grad_norm": 0.8447934551236296, + "language_loss": 0.5864656, + "learning_rate": 5.55548920864141e-07, + "loss": 0.60833311, + "num_input_tokens_seen": 136742185, + "step": 6355, + "time_per_iteration": 3.2402825355529785 + }, + { + "auxiliary_loss_clip": 0.01126066, + "auxiliary_loss_mlp": 0.01086285, + "balance_loss_clip": 1.0264411, + "balance_loss_mlp": 1.00608063, + "epoch": 0.7642638129020621, + "flos": 16835784785280.0, + "grad_norm": 1.6487665556764735, + "language_loss": 0.77846599, + "learning_rate": 5.550102487594113e-07, + "loss": 0.80058956, + "num_input_tokens_seen": 136760855, + "step": 6356, + "time_per_iteration": 2.675767183303833 + }, + { + "auxiliary_loss_clip": 0.0109774, + "auxiliary_loss_mlp": 0.00872767, + "balance_loss_clip": 1.02249861, + "balance_loss_mlp": 1.00008345, + "epoch": 0.7643840557927013, + "flos": 30408940391040.0, + "grad_norm": 1.5753214229460408, + "language_loss": 0.71519387, + "learning_rate": 5.54471795858477e-07, + "loss": 0.73489898, + "num_input_tokens_seen": 136780925, + "step": 6357, + "time_per_iteration": 2.8237602710723877 + }, + { + "auxiliary_loss_clip": 0.01110343, + "auxiliary_loss_mlp": 0.01083736, + "balance_loss_clip": 1.02485776, + "balance_loss_mlp": 1.00357974, + "epoch": 0.7645042986833404, + "flos": 16983234115200.0, + "grad_norm": 2.2704720795639135, + "language_loss": 0.8291297, + "learning_rate": 5.539335622430235e-07, + "loss": 0.8510704, + "num_input_tokens_seen": 136799545, + "step": 6358, + "time_per_iteration": 2.747982978820801 + }, + { + "auxiliary_loss_clip": 0.01126505, + "auxiliary_loss_mlp": 0.01083746, + "balance_loss_clip": 1.02507949, + "balance_loss_mlp": 1.00354171, + "epoch": 0.7646245415739794, + "flos": 17311493531520.0, + "grad_norm": 2.237596174867011, + "language_loss": 0.74744409, + "learning_rate": 5.533955479946975e-07, + "loss": 0.76954663, + "num_input_tokens_seen": 136818325, + "step": 6359, + "time_per_iteration": 2.639024257659912 + }, + { + "auxiliary_loss_clip": 0.01061311, + "auxiliary_loss_mlp": 0.00873173, + "balance_loss_clip": 1.0171355, + "balance_loss_mlp": 1.0015322, + "epoch": 0.7647447844646186, + "flos": 70402332666240.0, + "grad_norm": 0.9020775213958896, + "language_loss": 0.65827346, + "learning_rate": 5.528577531951173e-07, + "loss": 0.67761827, + "num_input_tokens_seen": 136878730, + "step": 6360, + "time_per_iteration": 3.341142177581787 + }, + { + "auxiliary_loss_clip": 0.0111845, + "auxiliary_loss_mlp": 0.01084565, + "balance_loss_clip": 1.02565992, + "balance_loss_mlp": 1.00445592, + "epoch": 0.7648650273552576, + "flos": 17675914965120.0, + "grad_norm": 1.918467193302372, + "language_loss": 0.73793191, + "learning_rate": 5.523201779258653e-07, + "loss": 0.75996208, + "num_input_tokens_seen": 136897705, + "step": 6361, + "time_per_iteration": 2.6949002742767334 + }, + { + "auxiliary_loss_clip": 0.01134654, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_clip": 1.02534866, + "balance_loss_mlp": 1.00353968, + "epoch": 0.7649852702458967, + "flos": 22162019247360.0, + "grad_norm": 1.7448447326975869, + "language_loss": 0.83922112, + "learning_rate": 5.517828222684912e-07, + "loss": 0.86140418, + "num_input_tokens_seen": 136918360, + "step": 6362, + "time_per_iteration": 2.67836856842041 + }, + { + "auxiliary_loss_clip": 0.01098161, + "auxiliary_loss_mlp": 0.01078855, + "balance_loss_clip": 1.01937759, + "balance_loss_mlp": 0.99989063, + "epoch": 0.7651055131365359, + "flos": 69848338227840.0, + "grad_norm": 0.7578387407603703, + "language_loss": 0.59059358, + "learning_rate": 5.512456863045117e-07, + "loss": 0.61236376, + "num_input_tokens_seen": 136979050, + "step": 6363, + "time_per_iteration": 3.385265350341797 + }, + { + "auxiliary_loss_clip": 0.01135617, + "auxiliary_loss_mlp": 0.01084307, + "balance_loss_clip": 1.0258801, + "balance_loss_mlp": 1.00415087, + "epoch": 0.7652257560271749, + "flos": 19464014931840.0, + "grad_norm": 1.717338822363016, + "language_loss": 0.73879844, + "learning_rate": 5.507087701154089e-07, + "loss": 0.76099765, + "num_input_tokens_seen": 136998970, + "step": 6364, + "time_per_iteration": 2.6105141639709473 + }, + { + "auxiliary_loss_clip": 0.01099876, + "auxiliary_loss_mlp": 0.0108412, + "balance_loss_clip": 1.02433026, + "balance_loss_mlp": 1.00391603, + "epoch": 0.765345998917814, + "flos": 15961108700160.0, + "grad_norm": 3.521418622748855, + "language_loss": 0.7520324, + "learning_rate": 5.50172073782634e-07, + "loss": 0.77387238, + "num_input_tokens_seen": 137016950, + "step": 6365, + "time_per_iteration": 2.7610526084899902 + }, + { + "auxiliary_loss_clip": 0.01101194, + "auxiliary_loss_mlp": 0.01084445, + "balance_loss_clip": 1.02466166, + "balance_loss_mlp": 1.00428903, + "epoch": 0.7654662418084531, + "flos": 23659853408640.0, + "grad_norm": 2.632569675066839, + "language_loss": 0.87672663, + "learning_rate": 5.496355973876023e-07, + "loss": 0.89858294, + "num_input_tokens_seen": 137036205, + "step": 6366, + "time_per_iteration": 2.8773574829101562 + }, + { + "auxiliary_loss_clip": 0.01109945, + "auxiliary_loss_mlp": 0.00873035, + "balance_loss_clip": 1.02519393, + "balance_loss_mlp": 1.00004745, + "epoch": 0.7655864846990922, + "flos": 41463608878080.0, + "grad_norm": 2.2551108954830856, + "language_loss": 0.71042281, + "learning_rate": 5.490993410116984e-07, + "loss": 0.73025256, + "num_input_tokens_seen": 137059195, + "step": 6367, + "time_per_iteration": 2.94512939453125 + }, + { + "auxiliary_loss_clip": 0.01104838, + "auxiliary_loss_mlp": 0.01084355, + "balance_loss_clip": 1.02235055, + "balance_loss_mlp": 1.00434184, + "epoch": 0.7657067275897312, + "flos": 43142684088960.0, + "grad_norm": 1.658720414080715, + "language_loss": 0.69520307, + "learning_rate": 5.485633047362704e-07, + "loss": 0.71709502, + "num_input_tokens_seen": 137081200, + "step": 6368, + "time_per_iteration": 3.816046953201294 + }, + { + "auxiliary_loss_clip": 0.01138188, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_clip": 1.02890754, + "balance_loss_mlp": 1.00395441, + "epoch": 0.7658269704803703, + "flos": 17311780840320.0, + "grad_norm": 1.9436071765122014, + "language_loss": 0.78435349, + "learning_rate": 5.480274886426341e-07, + "loss": 0.80657697, + "num_input_tokens_seen": 137097840, + "step": 6369, + "time_per_iteration": 2.6011300086975098 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01084164, + "balance_loss_clip": 1.02276158, + "balance_loss_mlp": 1.00415111, + "epoch": 0.7659472133710095, + "flos": 12568160977920.0, + "grad_norm": 1.9389247216910306, + "language_loss": 0.7801199, + "learning_rate": 5.474918928120744e-07, + "loss": 0.80217195, + "num_input_tokens_seen": 137114335, + "step": 6370, + "time_per_iteration": 3.6705198287963867 + }, + { + "auxiliary_loss_clip": 0.01126169, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_clip": 1.0259918, + "balance_loss_mlp": 1.00371361, + "epoch": 0.7660674562616485, + "flos": 22707430335360.0, + "grad_norm": 1.515462110118945, + "language_loss": 0.87165332, + "learning_rate": 5.469565173258392e-07, + "loss": 0.89375281, + "num_input_tokens_seen": 137132850, + "step": 6371, + "time_per_iteration": 2.6577348709106445 + }, + { + "auxiliary_loss_clip": 0.01135946, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_clip": 1.026016, + "balance_loss_mlp": 1.00386608, + "epoch": 0.7661876991522876, + "flos": 17056455989760.0, + "grad_norm": 1.7262926788284947, + "language_loss": 0.63784832, + "learning_rate": 5.464213622651454e-07, + "loss": 0.66004843, + "num_input_tokens_seen": 137150665, + "step": 6372, + "time_per_iteration": 3.467186450958252 + }, + { + "auxiliary_loss_clip": 0.01106779, + "auxiliary_loss_mlp": 0.0108394, + "balance_loss_clip": 1.02366996, + "balance_loss_mlp": 1.0037837, + "epoch": 0.7663079420429267, + "flos": 20084228092800.0, + "grad_norm": 1.750222370645917, + "language_loss": 0.84357923, + "learning_rate": 5.458864277111753e-07, + "loss": 0.86548638, + "num_input_tokens_seen": 137168500, + "step": 6373, + "time_per_iteration": 2.806126832962036 + }, + { + "auxiliary_loss_clip": 0.01116593, + "auxiliary_loss_mlp": 0.00872885, + "balance_loss_clip": 1.02521265, + "balance_loss_mlp": 1.0001018, + "epoch": 0.7664281849335658, + "flos": 12677473042560.0, + "grad_norm": 2.2028808205979558, + "language_loss": 0.69033146, + "learning_rate": 5.453517137450769e-07, + "loss": 0.71022624, + "num_input_tokens_seen": 137185075, + "step": 6374, + "time_per_iteration": 3.6444437503814697 + }, + { + "auxiliary_loss_clip": 0.01126209, + "auxiliary_loss_mlp": 0.01083476, + "balance_loss_clip": 1.02595663, + "balance_loss_mlp": 1.00331926, + "epoch": 0.7665484278242048, + "flos": 22345271458560.0, + "grad_norm": 1.8808199768630562, + "language_loss": 0.7611326, + "learning_rate": 5.448172204479684e-07, + "loss": 0.78322947, + "num_input_tokens_seen": 137204355, + "step": 6375, + "time_per_iteration": 2.711557626724243 + }, + { + "auxiliary_loss_clip": 0.01135405, + "auxiliary_loss_mlp": 0.01083813, + "balance_loss_clip": 1.02642202, + "balance_loss_mlp": 1.00375199, + "epoch": 0.766668670714844, + "flos": 23617909301760.0, + "grad_norm": 1.9503318619805867, + "language_loss": 0.74625766, + "learning_rate": 5.442829479009294e-07, + "loss": 0.76844978, + "num_input_tokens_seen": 137223135, + "step": 6376, + "time_per_iteration": 2.660600423812866 + }, + { + "auxiliary_loss_clip": 0.01111233, + "auxiliary_loss_mlp": 0.01084964, + "balance_loss_clip": 1.0261668, + "balance_loss_mlp": 1.00471234, + "epoch": 0.7667889136054831, + "flos": 19427134642560.0, + "grad_norm": 1.7699214737146993, + "language_loss": 0.71286535, + "learning_rate": 5.437488961850103e-07, + "loss": 0.7348274, + "num_input_tokens_seen": 137242935, + "step": 6377, + "time_per_iteration": 2.707268238067627 + }, + { + "auxiliary_loss_clip": 0.01095267, + "auxiliary_loss_mlp": 0.01084347, + "balance_loss_clip": 1.02283251, + "balance_loss_mlp": 1.00428605, + "epoch": 0.7669091564961221, + "flos": 26866352609280.0, + "grad_norm": 1.774247988879006, + "language_loss": 0.75453997, + "learning_rate": 5.432150653812258e-07, + "loss": 0.77633613, + "num_input_tokens_seen": 137262970, + "step": 6378, + "time_per_iteration": 2.955296516418457 + }, + { + "auxiliary_loss_clip": 0.01125375, + "auxiliary_loss_mlp": 0.01084218, + "balance_loss_clip": 1.02516603, + "balance_loss_mlp": 1.00410903, + "epoch": 0.7670293993867613, + "flos": 12385303816320.0, + "grad_norm": 2.9361440498594633, + "language_loss": 0.82464188, + "learning_rate": 5.42681455570557e-07, + "loss": 0.8467378, + "num_input_tokens_seen": 137279500, + "step": 6379, + "time_per_iteration": 2.655406951904297 + }, + { + "auxiliary_loss_clip": 0.0113421, + "auxiliary_loss_mlp": 0.01084351, + "balance_loss_clip": 1.02501118, + "balance_loss_mlp": 1.00419462, + "epoch": 0.7671496422774003, + "flos": 21762944167680.0, + "grad_norm": 1.688332059711195, + "language_loss": 0.6474188, + "learning_rate": 5.42148066833954e-07, + "loss": 0.66960442, + "num_input_tokens_seen": 137298745, + "step": 6380, + "time_per_iteration": 2.654627799987793 + }, + { + "auxiliary_loss_clip": 0.011355, + "auxiliary_loss_mlp": 0.01084668, + "balance_loss_clip": 1.02618361, + "balance_loss_mlp": 1.00455952, + "epoch": 0.7672698851680394, + "flos": 21069221823360.0, + "grad_norm": 2.273150915145667, + "language_loss": 0.75382119, + "learning_rate": 5.416148992523289e-07, + "loss": 0.77602291, + "num_input_tokens_seen": 137317320, + "step": 6381, + "time_per_iteration": 2.6283435821533203 + }, + { + "auxiliary_loss_clip": 0.01079869, + "auxiliary_loss_mlp": 0.01083143, + "balance_loss_clip": 1.02329612, + "balance_loss_mlp": 1.00303483, + "epoch": 0.7673901280586786, + "flos": 16976697840000.0, + "grad_norm": 2.156167020276999, + "language_loss": 0.78270835, + "learning_rate": 5.410819529065644e-07, + "loss": 0.80433846, + "num_input_tokens_seen": 137335275, + "step": 6382, + "time_per_iteration": 2.8387112617492676 + }, + { + "auxiliary_loss_clip": 0.01097527, + "auxiliary_loss_mlp": 0.01084336, + "balance_loss_clip": 1.02271688, + "balance_loss_mlp": 1.00427532, + "epoch": 0.7675103709493176, + "flos": 29242669697280.0, + "grad_norm": 2.274183740823519, + "language_loss": 0.65398765, + "learning_rate": 5.405492278775079e-07, + "loss": 0.67580628, + "num_input_tokens_seen": 137355055, + "step": 6383, + "time_per_iteration": 2.8138716220855713 + }, + { + "auxiliary_loss_clip": 0.01118531, + "auxiliary_loss_mlp": 0.01083788, + "balance_loss_clip": 1.02444232, + "balance_loss_mlp": 1.00363171, + "epoch": 0.7676306138399567, + "flos": 29023004073600.0, + "grad_norm": 2.0654945688711153, + "language_loss": 0.79905891, + "learning_rate": 5.400167242459732e-07, + "loss": 0.82108212, + "num_input_tokens_seen": 137374015, + "step": 6384, + "time_per_iteration": 2.7438313961029053 + }, + { + "auxiliary_loss_clip": 0.01123291, + "auxiliary_loss_mlp": 0.01084805, + "balance_loss_clip": 1.02417541, + "balance_loss_mlp": 1.00464821, + "epoch": 0.7677508567305958, + "flos": 22565116650240.0, + "grad_norm": 1.6699080458659712, + "language_loss": 0.80702126, + "learning_rate": 5.394844420927405e-07, + "loss": 0.82910216, + "num_input_tokens_seen": 137393625, + "step": 6385, + "time_per_iteration": 2.6851398944854736 + }, + { + "auxiliary_loss_clip": 0.0113489, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_clip": 1.02598476, + "balance_loss_mlp": 1.00412416, + "epoch": 0.7678710996212349, + "flos": 25411432222080.0, + "grad_norm": 1.7926610058377321, + "language_loss": 0.73163426, + "learning_rate": 5.389523814985562e-07, + "loss": 0.75382602, + "num_input_tokens_seen": 137413045, + "step": 6386, + "time_per_iteration": 2.8053057193756104 + }, + { + "auxiliary_loss_clip": 0.0109475, + "auxiliary_loss_mlp": 0.01085237, + "balance_loss_clip": 1.02063107, + "balance_loss_mlp": 1.00503325, + "epoch": 0.767991342511874, + "flos": 26756825063040.0, + "grad_norm": 1.6486376040787603, + "language_loss": 0.76136172, + "learning_rate": 5.384205425441344e-07, + "loss": 0.78316158, + "num_input_tokens_seen": 137433955, + "step": 6387, + "time_per_iteration": 2.871631383895874 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.0108443, + "balance_loss_clip": 1.02506387, + "balance_loss_mlp": 1.00427365, + "epoch": 0.7681115854025131, + "flos": 26359509749760.0, + "grad_norm": 1.6632230065901208, + "language_loss": 0.84290147, + "learning_rate": 5.378889253101537e-07, + "loss": 0.86491412, + "num_input_tokens_seen": 137454510, + "step": 6388, + "time_per_iteration": 2.786315441131592 + }, + { + "auxiliary_loss_clip": 0.01126208, + "auxiliary_loss_mlp": 0.01083058, + "balance_loss_clip": 1.02496636, + "balance_loss_mlp": 1.00299668, + "epoch": 0.7682318282931522, + "flos": 23257043314560.0, + "grad_norm": 1.6886118291531942, + "language_loss": 0.80734801, + "learning_rate": 5.373575298772617e-07, + "loss": 0.82944065, + "num_input_tokens_seen": 137473630, + "step": 6389, + "time_per_iteration": 2.7553179264068604 + }, + { + "auxiliary_loss_clip": 0.01107854, + "auxiliary_loss_mlp": 0.01079144, + "balance_loss_clip": 1.02039802, + "balance_loss_mlp": 1.00017953, + "epoch": 0.7683520711837912, + "flos": 70072457137920.0, + "grad_norm": 0.7609585605086023, + "language_loss": 0.61298341, + "learning_rate": 5.368263563260689e-07, + "loss": 0.63485336, + "num_input_tokens_seen": 137538765, + "step": 6390, + "time_per_iteration": 3.3081154823303223 + }, + { + "auxiliary_loss_clip": 0.01127725, + "auxiliary_loss_mlp": 0.01083411, + "balance_loss_clip": 1.02580452, + "balance_loss_mlp": 1.00325513, + "epoch": 0.7684723140744304, + "flos": 18624890332800.0, + "grad_norm": 1.5060697497569204, + "language_loss": 0.64293116, + "learning_rate": 5.362954047371537e-07, + "loss": 0.66504252, + "num_input_tokens_seen": 137557875, + "step": 6391, + "time_per_iteration": 2.7421252727508545 + }, + { + "auxiliary_loss_clip": 0.01100517, + "auxiliary_loss_mlp": 0.01084423, + "balance_loss_clip": 1.02166712, + "balance_loss_mlp": 1.00417161, + "epoch": 0.7685925569650695, + "flos": 27452989532160.0, + "grad_norm": 1.5851245177694022, + "language_loss": 0.71887493, + "learning_rate": 5.357646751910627e-07, + "loss": 0.74072433, + "num_input_tokens_seen": 137579055, + "step": 6392, + "time_per_iteration": 2.782784938812256 + }, + { + "auxiliary_loss_clip": 0.01117743, + "auxiliary_loss_mlp": 0.01084912, + "balance_loss_clip": 1.0251745, + "balance_loss_mlp": 1.00475597, + "epoch": 0.7687127998557085, + "flos": 24535714642560.0, + "grad_norm": 2.2985781003093426, + "language_loss": 0.79728079, + "learning_rate": 5.352341677683061e-07, + "loss": 0.81930733, + "num_input_tokens_seen": 137600355, + "step": 6393, + "time_per_iteration": 2.7766849994659424 + }, + { + "auxiliary_loss_clip": 0.0110878, + "auxiliary_loss_mlp": 0.0108345, + "balance_loss_clip": 1.02417386, + "balance_loss_mlp": 1.00334144, + "epoch": 0.7688330427463477, + "flos": 25155963717120.0, + "grad_norm": 2.6658254403686166, + "language_loss": 0.79080248, + "learning_rate": 5.347038825493617e-07, + "loss": 0.81272483, + "num_input_tokens_seen": 137621885, + "step": 6394, + "time_per_iteration": 3.6649343967437744 + }, + { + "auxiliary_loss_clip": 0.01115073, + "auxiliary_loss_mlp": 0.0108385, + "balance_loss_clip": 1.02483487, + "balance_loss_mlp": 1.00374174, + "epoch": 0.7689532856369867, + "flos": 21211284113280.0, + "grad_norm": 2.3877177070366544, + "language_loss": 0.68399, + "learning_rate": 5.341738196146732e-07, + "loss": 0.70597923, + "num_input_tokens_seen": 137640230, + "step": 6395, + "time_per_iteration": 2.703763961791992 + }, + { + "auxiliary_loss_clip": 0.01126582, + "auxiliary_loss_mlp": 0.01083788, + "balance_loss_clip": 1.02493691, + "balance_loss_mlp": 1.00358379, + "epoch": 0.7690735285276258, + "flos": 25119083427840.0, + "grad_norm": 1.8989046441066195, + "language_loss": 0.73435283, + "learning_rate": 5.336439790446503e-07, + "loss": 0.75645655, + "num_input_tokens_seen": 137659330, + "step": 6396, + "time_per_iteration": 3.5754024982452393 + }, + { + "auxiliary_loss_clip": 0.01110222, + "auxiliary_loss_mlp": 0.01083254, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.00314569, + "epoch": 0.769193771418265, + "flos": 54744020640000.0, + "grad_norm": 1.8393461160251237, + "language_loss": 0.62393653, + "learning_rate": 5.331143609196711e-07, + "loss": 0.64587134, + "num_input_tokens_seen": 137683145, + "step": 6397, + "time_per_iteration": 3.8501083850860596 + }, + { + "auxiliary_loss_clip": 0.01125345, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_clip": 1.02502954, + "balance_loss_mlp": 1.00489068, + "epoch": 0.769314014308904, + "flos": 37341890115840.0, + "grad_norm": 4.647641820217336, + "language_loss": 0.77422166, + "learning_rate": 5.325849653200758e-07, + "loss": 0.79632509, + "num_input_tokens_seen": 137707095, + "step": 6398, + "time_per_iteration": 2.780928611755371 + }, + { + "auxiliary_loss_clip": 0.01135701, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_clip": 1.02657473, + "balance_loss_mlp": 1.0041678, + "epoch": 0.7694342571995431, + "flos": 20631686256000.0, + "grad_norm": 1.6295998089509731, + "language_loss": 0.76273763, + "learning_rate": 5.32055792326175e-07, + "loss": 0.78493834, + "num_input_tokens_seen": 137725520, + "step": 6399, + "time_per_iteration": 2.661323070526123 + }, + { + "auxiliary_loss_clip": 0.01115877, + "auxiliary_loss_mlp": 0.01084126, + "balance_loss_clip": 1.02439117, + "balance_loss_mlp": 1.00396943, + "epoch": 0.7695545000901821, + "flos": 24207706621440.0, + "grad_norm": 1.7104798746770176, + "language_loss": 0.72809315, + "learning_rate": 5.315268420182437e-07, + "loss": 0.7500931, + "num_input_tokens_seen": 137744195, + "step": 6400, + "time_per_iteration": 3.717414379119873 + }, + { + "auxiliary_loss_clip": 0.01109144, + "auxiliary_loss_mlp": 0.00872836, + "balance_loss_clip": 1.02530408, + "balance_loss_mlp": 1.00009465, + "epoch": 0.7696747429808213, + "flos": 28001273708160.0, + "grad_norm": 2.168310255841477, + "language_loss": 0.7652688, + "learning_rate": 5.309981144765221e-07, + "loss": 0.7850886, + "num_input_tokens_seen": 137764340, + "step": 6401, + "time_per_iteration": 2.8465771675109863 + }, + { + "auxiliary_loss_clip": 0.01100575, + "auxiliary_loss_mlp": 0.01083347, + "balance_loss_clip": 1.02490866, + "balance_loss_mlp": 1.00328577, + "epoch": 0.7697949858714603, + "flos": 11509550323200.0, + "grad_norm": 2.0569958097645467, + "language_loss": 0.75247139, + "learning_rate": 5.304696097812196e-07, + "loss": 0.77431059, + "num_input_tokens_seen": 137780940, + "step": 6402, + "time_per_iteration": 2.8093903064727783 + }, + { + "auxiliary_loss_clip": 0.01117496, + "auxiliary_loss_mlp": 0.01083893, + "balance_loss_clip": 1.02460015, + "balance_loss_mlp": 1.00364172, + "epoch": 0.7699152287620994, + "flos": 26688271956480.0, + "grad_norm": 3.3263851353702103, + "language_loss": 0.59736151, + "learning_rate": 5.299413280125078e-07, + "loss": 0.61937541, + "num_input_tokens_seen": 137799250, + "step": 6403, + "time_per_iteration": 2.7359068393707275 + }, + { + "auxiliary_loss_clip": 0.01120288, + "auxiliary_loss_mlp": 0.01083911, + "balance_loss_clip": 1.02638912, + "balance_loss_mlp": 1.00385034, + "epoch": 0.7700354716527386, + "flos": 16544944362240.0, + "grad_norm": 2.1628660655203285, + "language_loss": 0.72857857, + "learning_rate": 5.294132692505284e-07, + "loss": 0.7506206, + "num_input_tokens_seen": 137817660, + "step": 6404, + "time_per_iteration": 2.74849009513855 + }, + { + "auxiliary_loss_clip": 0.01097447, + "auxiliary_loss_mlp": 0.01084093, + "balance_loss_clip": 1.02228963, + "balance_loss_mlp": 1.0040319, + "epoch": 0.7701557145433776, + "flos": 19242733196160.0, + "grad_norm": 1.9691215409342844, + "language_loss": 0.79209805, + "learning_rate": 5.288854335753861e-07, + "loss": 0.8139134, + "num_input_tokens_seen": 137835920, + "step": 6405, + "time_per_iteration": 2.7638914585113525 + }, + { + "auxiliary_loss_clip": 0.01126535, + "auxiliary_loss_mlp": 0.01083575, + "balance_loss_clip": 1.0255506, + "balance_loss_mlp": 1.00346625, + "epoch": 0.7702759574340167, + "flos": 31685744211840.0, + "grad_norm": 1.618055250635434, + "language_loss": 0.75524038, + "learning_rate": 5.283578210671551e-07, + "loss": 0.77734149, + "num_input_tokens_seen": 137858160, + "step": 6406, + "time_per_iteration": 2.856628894805908 + }, + { + "auxiliary_loss_clip": 0.01117068, + "auxiliary_loss_mlp": 0.01084268, + "balance_loss_clip": 1.02491665, + "balance_loss_mlp": 1.00415969, + "epoch": 0.7703962003246558, + "flos": 16800089644800.0, + "grad_norm": 1.8791864536562022, + "language_loss": 0.76399183, + "learning_rate": 5.278304318058719e-07, + "loss": 0.7860052, + "num_input_tokens_seen": 137876015, + "step": 6407, + "time_per_iteration": 2.68681263923645 + }, + { + "auxiliary_loss_clip": 0.01083818, + "auxiliary_loss_mlp": 0.01084711, + "balance_loss_clip": 1.01919198, + "balance_loss_mlp": 1.00455451, + "epoch": 0.7705164432152949, + "flos": 35736072693120.0, + "grad_norm": 1.685142225847327, + "language_loss": 0.78827572, + "learning_rate": 5.273032658715411e-07, + "loss": 0.80996108, + "num_input_tokens_seen": 137898825, + "step": 6408, + "time_per_iteration": 3.008920431137085 + }, + { + "auxiliary_loss_clip": 0.01096909, + "auxiliary_loss_mlp": 0.01083926, + "balance_loss_clip": 1.02207434, + "balance_loss_mlp": 1.00381732, + "epoch": 0.7706366861059339, + "flos": 23365960329600.0, + "grad_norm": 1.8129755385779722, + "language_loss": 0.76725125, + "learning_rate": 5.267763233441347e-07, + "loss": 0.78905958, + "num_input_tokens_seen": 137919455, + "step": 6409, + "time_per_iteration": 2.8454222679138184 + }, + { + "auxiliary_loss_clip": 0.01125895, + "auxiliary_loss_mlp": 0.01083568, + "balance_loss_clip": 1.02593768, + "balance_loss_mlp": 1.00336444, + "epoch": 0.7707569289965731, + "flos": 22929897219840.0, + "grad_norm": 2.7923182569043603, + "language_loss": 0.69065541, + "learning_rate": 5.26249604303588e-07, + "loss": 0.71275008, + "num_input_tokens_seen": 137937960, + "step": 6410, + "time_per_iteration": 2.6744325160980225 + }, + { + "auxiliary_loss_clip": 0.01135821, + "auxiliary_loss_mlp": 0.01084792, + "balance_loss_clip": 1.02625942, + "balance_loss_mlp": 1.00468373, + "epoch": 0.7708771718872122, + "flos": 17420661941760.0, + "grad_norm": 2.0386501092091134, + "language_loss": 0.78501922, + "learning_rate": 5.257231088298057e-07, + "loss": 0.80722535, + "num_input_tokens_seen": 137956370, + "step": 6411, + "time_per_iteration": 2.654991865158081 + }, + { + "auxiliary_loss_clip": 0.01091041, + "auxiliary_loss_mlp": 0.01079286, + "balance_loss_clip": 1.0209353, + "balance_loss_mlp": 1.00032139, + "epoch": 0.7709974147778512, + "flos": 72241316248320.0, + "grad_norm": 1.0367426581503152, + "language_loss": 0.54022992, + "learning_rate": 5.25196837002655e-07, + "loss": 0.56193322, + "num_input_tokens_seen": 138016080, + "step": 6412, + "time_per_iteration": 3.3466405868530273 + }, + { + "auxiliary_loss_clip": 0.01118157, + "auxiliary_loss_mlp": 0.01084464, + "balance_loss_clip": 1.02568054, + "balance_loss_mlp": 1.0042119, + "epoch": 0.7711176576684904, + "flos": 39859694876160.0, + "grad_norm": 1.9382157807367424, + "language_loss": 0.68256521, + "learning_rate": 5.24670788901971e-07, + "loss": 0.70459139, + "num_input_tokens_seen": 138039170, + "step": 6413, + "time_per_iteration": 2.85915207862854 + }, + { + "auxiliary_loss_clip": 0.01119866, + "auxiliary_loss_mlp": 0.01085038, + "balance_loss_clip": 1.02717447, + "balance_loss_mlp": 1.0048337, + "epoch": 0.7712379005591294, + "flos": 36976391274240.0, + "grad_norm": 2.770468883171572, + "language_loss": 0.69181818, + "learning_rate": 5.241449646075557e-07, + "loss": 0.71386719, + "num_input_tokens_seen": 138062395, + "step": 6414, + "time_per_iteration": 2.8431546688079834 + }, + { + "auxiliary_loss_clip": 0.01128508, + "auxiliary_loss_mlp": 0.01084734, + "balance_loss_clip": 1.02598369, + "balance_loss_mlp": 1.00457811, + "epoch": 0.7713581434497685, + "flos": 22776773541120.0, + "grad_norm": 1.9818216776784858, + "language_loss": 0.7243576, + "learning_rate": 5.236193641991762e-07, + "loss": 0.74649, + "num_input_tokens_seen": 138080325, + "step": 6415, + "time_per_iteration": 2.721959114074707 + }, + { + "auxiliary_loss_clip": 0.01113096, + "auxiliary_loss_mlp": 0.01083948, + "balance_loss_clip": 1.02289665, + "balance_loss_mlp": 1.00383902, + "epoch": 0.7714783863404077, + "flos": 24097460803200.0, + "grad_norm": 2.0569520714283285, + "language_loss": 0.6986692, + "learning_rate": 5.23093987756565e-07, + "loss": 0.72063965, + "num_input_tokens_seen": 138099020, + "step": 6416, + "time_per_iteration": 2.7646381855010986 + }, + { + "auxiliary_loss_clip": 0.01109065, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_clip": 1.02402174, + "balance_loss_mlp": 1.00400186, + "epoch": 0.7715986292310467, + "flos": 21063655215360.0, + "grad_norm": 2.3506738049091314, + "language_loss": 0.75280708, + "learning_rate": 5.225688353594217e-07, + "loss": 0.77473938, + "num_input_tokens_seen": 138118650, + "step": 6417, + "time_per_iteration": 2.876171350479126 + }, + { + "auxiliary_loss_clip": 0.011182, + "auxiliary_loss_mlp": 0.00872833, + "balance_loss_clip": 1.0255475, + "balance_loss_mlp": 1.00007272, + "epoch": 0.7717188721216858, + "flos": 20594877793920.0, + "grad_norm": 1.9983265074291001, + "language_loss": 0.77572119, + "learning_rate": 5.220439070874108e-07, + "loss": 0.79563159, + "num_input_tokens_seen": 138137890, + "step": 6418, + "time_per_iteration": 2.7981173992156982 + }, + { + "auxiliary_loss_clip": 0.01126051, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_clip": 1.02653408, + "balance_loss_mlp": 1.00497639, + "epoch": 0.7718391150123249, + "flos": 26250951870720.0, + "grad_norm": 1.6240387376933514, + "language_loss": 0.71070284, + "learning_rate": 5.215192030201652e-07, + "loss": 0.73281562, + "num_input_tokens_seen": 138158880, + "step": 6419, + "time_per_iteration": 3.5117480754852295 + }, + { + "auxiliary_loss_clip": 0.01101871, + "auxiliary_loss_mlp": 0.01083546, + "balance_loss_clip": 1.02358925, + "balance_loss_mlp": 1.00348473, + "epoch": 0.771959357902964, + "flos": 22049762267520.0, + "grad_norm": 1.818611957994146, + "language_loss": 0.86066151, + "learning_rate": 5.209947232372798e-07, + "loss": 0.88251567, + "num_input_tokens_seen": 138176370, + "step": 6420, + "time_per_iteration": 2.7955403327941895 + }, + { + "auxiliary_loss_clip": 0.01127112, + "auxiliary_loss_mlp": 0.0087294, + "balance_loss_clip": 1.02598357, + "balance_loss_mlp": 1.00011778, + "epoch": 0.772079600793603, + "flos": 30446000248320.0, + "grad_norm": 1.7467048553954205, + "language_loss": 0.81472355, + "learning_rate": 5.204704678183196e-07, + "loss": 0.83472407, + "num_input_tokens_seen": 138195105, + "step": 6421, + "time_per_iteration": 3.736712694168091 + }, + { + "auxiliary_loss_clip": 0.0113697, + "auxiliary_loss_mlp": 0.01082784, + "balance_loss_clip": 1.02755487, + "balance_loss_mlp": 1.00267601, + "epoch": 0.7721998436842422, + "flos": 12969857750400.0, + "grad_norm": 2.2469775867948187, + "language_loss": 0.84927392, + "learning_rate": 5.19946436842813e-07, + "loss": 0.87147152, + "num_input_tokens_seen": 138212235, + "step": 6422, + "time_per_iteration": 3.617591381072998 + }, + { + "auxiliary_loss_clip": 0.01103361, + "auxiliary_loss_mlp": 0.01084723, + "balance_loss_clip": 1.02173352, + "balance_loss_mlp": 1.00456619, + "epoch": 0.7723200865748813, + "flos": 32635509678720.0, + "grad_norm": 1.6441830689219283, + "language_loss": 0.68044418, + "learning_rate": 5.194226303902546e-07, + "loss": 0.70232505, + "num_input_tokens_seen": 138231970, + "step": 6423, + "time_per_iteration": 2.8477582931518555 + }, + { + "auxiliary_loss_clip": 0.01116028, + "auxiliary_loss_mlp": 0.01083373, + "balance_loss_clip": 1.02446425, + "balance_loss_mlp": 1.00321639, + "epoch": 0.7724403294655203, + "flos": 21105707063040.0, + "grad_norm": 2.7517869244333175, + "language_loss": 0.71260697, + "learning_rate": 5.188990485401072e-07, + "loss": 0.73460096, + "num_input_tokens_seen": 138251175, + "step": 6424, + "time_per_iteration": 2.7719309329986572 + }, + { + "auxiliary_loss_clip": 0.01124376, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_clip": 1.02490234, + "balance_loss_mlp": 1.00467682, + "epoch": 0.7725605723561595, + "flos": 22090736707200.0, + "grad_norm": 2.5803268508544748, + "language_loss": 0.86103642, + "learning_rate": 5.183756913717954e-07, + "loss": 0.883129, + "num_input_tokens_seen": 138270950, + "step": 6425, + "time_per_iteration": 2.6892480850219727 + }, + { + "auxiliary_loss_clip": 0.01115515, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_clip": 1.02416241, + "balance_loss_mlp": 1.00399804, + "epoch": 0.7726808152467985, + "flos": 34495610457600.0, + "grad_norm": 1.785823022142219, + "language_loss": 0.73136032, + "learning_rate": 5.178525589647136e-07, + "loss": 0.75335658, + "num_input_tokens_seen": 138292590, + "step": 6426, + "time_per_iteration": 3.687784433364868 + }, + { + "auxiliary_loss_clip": 0.01119265, + "auxiliary_loss_mlp": 0.01084169, + "balance_loss_clip": 1.02623916, + "balance_loss_mlp": 1.00410843, + "epoch": 0.7728010581374376, + "flos": 22306344094080.0, + "grad_norm": 1.8100133453037623, + "language_loss": 0.78908563, + "learning_rate": 5.173296513982197e-07, + "loss": 0.81112003, + "num_input_tokens_seen": 138311115, + "step": 6427, + "time_per_iteration": 2.7253854274749756 + }, + { + "auxiliary_loss_clip": 0.01110431, + "auxiliary_loss_mlp": 0.0108445, + "balance_loss_clip": 1.02633178, + "balance_loss_mlp": 1.00419879, + "epoch": 0.7729213010280768, + "flos": 27126453968640.0, + "grad_norm": 2.7108309038505825, + "language_loss": 0.64886487, + "learning_rate": 5.168069687516398e-07, + "loss": 0.67081368, + "num_input_tokens_seen": 138330885, + "step": 6428, + "time_per_iteration": 2.7741546630859375 + }, + { + "auxiliary_loss_clip": 0.0111587, + "auxiliary_loss_mlp": 0.01085785, + "balance_loss_clip": 1.02492952, + "balance_loss_mlp": 1.00567651, + "epoch": 0.7730415439187158, + "flos": 18150223080960.0, + "grad_norm": 1.853862662871175, + "language_loss": 0.71476746, + "learning_rate": 5.16284511104263e-07, + "loss": 0.73678398, + "num_input_tokens_seen": 138350020, + "step": 6429, + "time_per_iteration": 2.7219724655151367 + }, + { + "auxiliary_loss_clip": 0.01108653, + "auxiliary_loss_mlp": 0.01085325, + "balance_loss_clip": 1.02287805, + "balance_loss_mlp": 1.00502586, + "epoch": 0.7731617868093549, + "flos": 11947480940160.0, + "grad_norm": 2.5821177501016193, + "language_loss": 0.80921811, + "learning_rate": 5.157622785353457e-07, + "loss": 0.83115786, + "num_input_tokens_seen": 138368135, + "step": 6430, + "time_per_iteration": 2.7238476276397705 + }, + { + "auxiliary_loss_clip": 0.01107768, + "auxiliary_loss_mlp": 0.01079058, + "balance_loss_clip": 1.02037477, + "balance_loss_mlp": 1.00009429, + "epoch": 0.7732820296999939, + "flos": 64201027069440.0, + "grad_norm": 0.6427664132425537, + "language_loss": 0.60404062, + "learning_rate": 5.152402711241113e-07, + "loss": 0.62590885, + "num_input_tokens_seen": 138436040, + "step": 6431, + "time_per_iteration": 3.3154795169830322 + }, + { + "auxiliary_loss_clip": 0.01108055, + "auxiliary_loss_mlp": 0.01083923, + "balance_loss_clip": 1.02426386, + "balance_loss_mlp": 1.00386214, + "epoch": 0.7734022725906331, + "flos": 25302191984640.0, + "grad_norm": 1.6939881523933857, + "language_loss": 0.83068639, + "learning_rate": 5.147184889497465e-07, + "loss": 0.85260618, + "num_input_tokens_seen": 138455510, + "step": 6432, + "time_per_iteration": 2.8369011878967285 + }, + { + "auxiliary_loss_clip": 0.01100143, + "auxiliary_loss_mlp": 0.01084288, + "balance_loss_clip": 1.02374959, + "balance_loss_mlp": 1.00398898, + "epoch": 0.7735225154812722, + "flos": 17347440067200.0, + "grad_norm": 2.435933602207448, + "language_loss": 0.79984987, + "learning_rate": 5.141969320914072e-07, + "loss": 0.82169414, + "num_input_tokens_seen": 138473015, + "step": 6433, + "time_per_iteration": 2.780555486679077 + }, + { + "auxiliary_loss_clip": 0.0113568, + "auxiliary_loss_mlp": 0.01084359, + "balance_loss_clip": 1.02620566, + "balance_loss_mlp": 1.00410748, + "epoch": 0.7736427583719112, + "flos": 32630086725120.0, + "grad_norm": 2.880217055476522, + "language_loss": 0.62024522, + "learning_rate": 5.136756006282113e-07, + "loss": 0.64244556, + "num_input_tokens_seen": 138491680, + "step": 6434, + "time_per_iteration": 2.694459915161133 + }, + { + "auxiliary_loss_clip": 0.01136135, + "auxiliary_loss_mlp": 0.01084651, + "balance_loss_clip": 1.0262264, + "balance_loss_mlp": 1.00449455, + "epoch": 0.7737630012625504, + "flos": 19860073269120.0, + "grad_norm": 2.386683564007739, + "language_loss": 0.84867674, + "learning_rate": 5.131544946392446e-07, + "loss": 0.8708846, + "num_input_tokens_seen": 138506960, + "step": 6435, + "time_per_iteration": 2.670276641845703 + }, + { + "auxiliary_loss_clip": 0.01110834, + "auxiliary_loss_mlp": 0.01084048, + "balance_loss_clip": 1.02151442, + "balance_loss_mlp": 1.00389194, + "epoch": 0.7738832441531894, + "flos": 36022639397760.0, + "grad_norm": 2.15765648735234, + "language_loss": 0.63825262, + "learning_rate": 5.126336142035592e-07, + "loss": 0.66020143, + "num_input_tokens_seen": 138526995, + "step": 6436, + "time_per_iteration": 2.835592031478882 + }, + { + "auxiliary_loss_clip": 0.0111684, + "auxiliary_loss_mlp": 0.01083438, + "balance_loss_clip": 1.02422869, + "balance_loss_mlp": 1.00323462, + "epoch": 0.7740034870438285, + "flos": 13405274415360.0, + "grad_norm": 2.5654301066100307, + "language_loss": 0.71835816, + "learning_rate": 5.121129594001721e-07, + "loss": 0.74036098, + "num_input_tokens_seen": 138541260, + "step": 6437, + "time_per_iteration": 2.6543619632720947 + }, + { + "auxiliary_loss_clip": 0.01125882, + "auxiliary_loss_mlp": 0.01085803, + "balance_loss_clip": 1.02628827, + "balance_loss_mlp": 1.00569475, + "epoch": 0.7741237299344677, + "flos": 22086714384000.0, + "grad_norm": 1.6786883131334607, + "language_loss": 0.81104338, + "learning_rate": 5.115925303080661e-07, + "loss": 0.83316022, + "num_input_tokens_seen": 138560970, + "step": 6438, + "time_per_iteration": 2.747093677520752 + }, + { + "auxiliary_loss_clip": 0.01100389, + "auxiliary_loss_mlp": 0.01085322, + "balance_loss_clip": 1.0247457, + "balance_loss_mlp": 1.00516534, + "epoch": 0.7742439728251067, + "flos": 19864777950720.0, + "grad_norm": 2.054250720763044, + "language_loss": 0.7926451, + "learning_rate": 5.110723270061899e-07, + "loss": 0.81450224, + "num_input_tokens_seen": 138577460, + "step": 6439, + "time_per_iteration": 2.698512077331543 + }, + { + "auxiliary_loss_clip": 0.01134457, + "auxiliary_loss_mlp": 0.01083813, + "balance_loss_clip": 1.02543819, + "balance_loss_mlp": 1.00375175, + "epoch": 0.7743642157157458, + "flos": 16690167048960.0, + "grad_norm": 2.4630471721247402, + "language_loss": 0.7955215, + "learning_rate": 5.105523495734572e-07, + "loss": 0.8177042, + "num_input_tokens_seen": 138594860, + "step": 6440, + "time_per_iteration": 2.6077823638916016 + }, + { + "auxiliary_loss_clip": 0.01134974, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_clip": 1.02523398, + "balance_loss_mlp": 1.00300229, + "epoch": 0.7744844586063849, + "flos": 20304360593280.0, + "grad_norm": 1.485410213335786, + "language_loss": 0.75117695, + "learning_rate": 5.100325980887499e-07, + "loss": 0.77335787, + "num_input_tokens_seen": 138614785, + "step": 6441, + "time_per_iteration": 2.6350038051605225 + }, + { + "auxiliary_loss_clip": 0.01101902, + "auxiliary_loss_mlp": 0.01084411, + "balance_loss_clip": 1.0252707, + "balance_loss_mlp": 1.00430274, + "epoch": 0.774604701497024, + "flos": 22966705681920.0, + "grad_norm": 1.7457601175330884, + "language_loss": 0.8306812, + "learning_rate": 5.095130726309116e-07, + "loss": 0.85254437, + "num_input_tokens_seen": 138634960, + "step": 6442, + "time_per_iteration": 2.7542121410369873 + }, + { + "auxiliary_loss_clip": 0.01115784, + "auxiliary_loss_mlp": 0.01078847, + "balance_loss_clip": 1.02045918, + "balance_loss_mlp": 0.99988312, + "epoch": 0.774724944387663, + "flos": 60288523073280.0, + "grad_norm": 0.7904549747018458, + "language_loss": 0.59077018, + "learning_rate": 5.089937732787559e-07, + "loss": 0.61271656, + "num_input_tokens_seen": 138699520, + "step": 6443, + "time_per_iteration": 3.2381997108459473 + }, + { + "auxiliary_loss_clip": 0.01109844, + "auxiliary_loss_mlp": 0.01084386, + "balance_loss_clip": 1.02518737, + "balance_loss_mlp": 1.00413477, + "epoch": 0.7748451872783022, + "flos": 26761026954240.0, + "grad_norm": 2.3064779957408783, + "language_loss": 0.66624236, + "learning_rate": 5.084747001110592e-07, + "loss": 0.68818462, + "num_input_tokens_seen": 138719145, + "step": 6444, + "time_per_iteration": 3.7212319374084473 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.00872878, + "balance_loss_clip": 1.02240777, + "balance_loss_mlp": 1.00014031, + "epoch": 0.7749654301689413, + "flos": 30338627518080.0, + "grad_norm": 1.5471397167384469, + "language_loss": 0.69938713, + "learning_rate": 5.07955853206564e-07, + "loss": 0.71932328, + "num_input_tokens_seen": 138743850, + "step": 6445, + "time_per_iteration": 2.811314105987549 + }, + { + "auxiliary_loss_clip": 0.01126291, + "auxiliary_loss_mlp": 0.01083936, + "balance_loss_clip": 1.02529407, + "balance_loss_mlp": 1.0038271, + "epoch": 0.7750856730595803, + "flos": 43179851687040.0, + "grad_norm": 1.5020676519060883, + "language_loss": 0.70913398, + "learning_rate": 5.074372326439807e-07, + "loss": 0.73123622, + "num_input_tokens_seen": 138766860, + "step": 6446, + "time_per_iteration": 3.7552108764648438 + }, + { + "auxiliary_loss_clip": 0.01099092, + "auxiliary_loss_mlp": 0.01084141, + "balance_loss_clip": 1.02207172, + "balance_loss_mlp": 1.00393677, + "epoch": 0.7752059159502195, + "flos": 17640040256640.0, + "grad_norm": 2.077867674136481, + "language_loss": 0.73346382, + "learning_rate": 5.069188385019814e-07, + "loss": 0.75529623, + "num_input_tokens_seen": 138784560, + "step": 6447, + "time_per_iteration": 3.7441203594207764 + }, + { + "auxiliary_loss_clip": 0.01099724, + "auxiliary_loss_mlp": 0.01083819, + "balance_loss_clip": 1.02382267, + "balance_loss_mlp": 1.00371051, + "epoch": 0.7753261588408585, + "flos": 12677688524160.0, + "grad_norm": 2.9158455438839797, + "language_loss": 0.61141455, + "learning_rate": 5.064006708592077e-07, + "loss": 0.63324994, + "num_input_tokens_seen": 138800805, + "step": 6448, + "time_per_iteration": 2.8287246227264404 + }, + { + "auxiliary_loss_clip": 0.01109404, + "auxiliary_loss_mlp": 0.01083773, + "balance_loss_clip": 1.02458024, + "balance_loss_mlp": 1.0037117, + "epoch": 0.7754464017314976, + "flos": 16690741666560.0, + "grad_norm": 2.2251686474744297, + "language_loss": 0.75370765, + "learning_rate": 5.058827297942641e-07, + "loss": 0.77563941, + "num_input_tokens_seen": 138815910, + "step": 6449, + "time_per_iteration": 2.727684736251831 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.0108384, + "balance_loss_clip": 1.02650094, + "balance_loss_mlp": 1.00377929, + "epoch": 0.7755666446221368, + "flos": 19718944732800.0, + "grad_norm": 1.7717549652775726, + "language_loss": 0.75058401, + "learning_rate": 5.053650153857237e-07, + "loss": 0.77245057, + "num_input_tokens_seen": 138834920, + "step": 6450, + "time_per_iteration": 2.7177305221557617 + }, + { + "auxiliary_loss_clip": 0.01125327, + "auxiliary_loss_mlp": 0.01083424, + "balance_loss_clip": 1.0253191, + "balance_loss_mlp": 1.003268, + "epoch": 0.7756868875127758, + "flos": 18693623007360.0, + "grad_norm": 1.5914107555954649, + "language_loss": 0.69840699, + "learning_rate": 5.048475277121214e-07, + "loss": 0.72049451, + "num_input_tokens_seen": 138852135, + "step": 6451, + "time_per_iteration": 3.5901386737823486 + }, + { + "auxiliary_loss_clip": 0.0112551, + "auxiliary_loss_mlp": 0.01082557, + "balance_loss_clip": 1.02514219, + "balance_loss_mlp": 1.0024488, + "epoch": 0.7758071304034149, + "flos": 28404191543040.0, + "grad_norm": 1.6230017682631614, + "language_loss": 0.76975018, + "learning_rate": 5.043302668519598e-07, + "loss": 0.7918309, + "num_input_tokens_seen": 138871470, + "step": 6452, + "time_per_iteration": 2.7058515548706055 + }, + { + "auxiliary_loss_clip": 0.01127048, + "auxiliary_loss_mlp": 0.01083567, + "balance_loss_clip": 1.02576566, + "balance_loss_mlp": 1.00350618, + "epoch": 0.775927373294054, + "flos": 20595344670720.0, + "grad_norm": 1.7496167438058252, + "language_loss": 0.72219914, + "learning_rate": 5.038132328837079e-07, + "loss": 0.74430525, + "num_input_tokens_seen": 138889860, + "step": 6453, + "time_per_iteration": 2.6641292572021484 + }, + { + "auxiliary_loss_clip": 0.01128564, + "auxiliary_loss_mlp": 0.01083475, + "balance_loss_clip": 1.02746344, + "balance_loss_mlp": 1.00341439, + "epoch": 0.7760476161846931, + "flos": 22526368853760.0, + "grad_norm": 2.094825070266447, + "language_loss": 0.73722571, + "learning_rate": 5.032964258857993e-07, + "loss": 0.75934613, + "num_input_tokens_seen": 138909955, + "step": 6454, + "time_per_iteration": 2.7351105213165283 + }, + { + "auxiliary_loss_clip": 0.01126776, + "auxiliary_loss_mlp": 0.01084685, + "balance_loss_clip": 1.02515984, + "balance_loss_mlp": 1.0044812, + "epoch": 0.7761678590753321, + "flos": 48651488403840.0, + "grad_norm": 1.474655619281702, + "language_loss": 0.68449718, + "learning_rate": 5.027798459366329e-07, + "loss": 0.70661175, + "num_input_tokens_seen": 138935320, + "step": 6455, + "time_per_iteration": 2.9305036067962646 + }, + { + "auxiliary_loss_clip": 0.01127962, + "auxiliary_loss_mlp": 0.01084358, + "balance_loss_clip": 1.02637005, + "balance_loss_mlp": 1.00410593, + "epoch": 0.7762881019659713, + "flos": 26177047637760.0, + "grad_norm": 1.4125131864688913, + "language_loss": 0.63711929, + "learning_rate": 5.02263493114573e-07, + "loss": 0.65924245, + "num_input_tokens_seen": 138957115, + "step": 6456, + "time_per_iteration": 2.713771343231201 + }, + { + "auxiliary_loss_clip": 0.01135178, + "auxiliary_loss_mlp": 0.01082781, + "balance_loss_clip": 1.02584291, + "balance_loss_mlp": 1.00272, + "epoch": 0.7764083448566104, + "flos": 20588341518720.0, + "grad_norm": 2.2351296141039505, + "language_loss": 0.77343082, + "learning_rate": 5.017473674979502e-07, + "loss": 0.79561043, + "num_input_tokens_seen": 138973140, + "step": 6457, + "time_per_iteration": 2.6788170337677 + }, + { + "auxiliary_loss_clip": 0.0107569, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_clip": 1.01414895, + "balance_loss_mlp": 1.0003494, + "epoch": 0.7765285877472494, + "flos": 67293078560640.0, + "grad_norm": 0.7597991210516312, + "language_loss": 0.58348823, + "learning_rate": 5.01231469165061e-07, + "loss": 0.60503829, + "num_input_tokens_seen": 139028965, + "step": 6458, + "time_per_iteration": 3.2353320121765137 + }, + { + "auxiliary_loss_clip": 0.01107594, + "auxiliary_loss_mlp": 0.01078966, + "balance_loss_clip": 1.0204066, + "balance_loss_mlp": 1.00000215, + "epoch": 0.7766488306378886, + "flos": 61344476121600.0, + "grad_norm": 0.8281234960197797, + "language_loss": 0.56931829, + "learning_rate": 5.007157981941663e-07, + "loss": 0.59118384, + "num_input_tokens_seen": 139094325, + "step": 6459, + "time_per_iteration": 3.3896634578704834 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_clip": 1.02078557, + "balance_loss_mlp": 0.99994689, + "epoch": 0.7767690735285276, + "flos": 62946199393920.0, + "grad_norm": 0.8787037621085116, + "language_loss": 0.6749568, + "learning_rate": 5.002003546634928e-07, + "loss": 0.69674647, + "num_input_tokens_seen": 139150425, + "step": 6460, + "time_per_iteration": 3.271818161010742 + }, + { + "auxiliary_loss_clip": 0.01091424, + "auxiliary_loss_mlp": 0.01083128, + "balance_loss_clip": 1.01985919, + "balance_loss_mlp": 1.00301981, + "epoch": 0.7768893164191667, + "flos": 20886400575360.0, + "grad_norm": 1.6538416295696876, + "language_loss": 0.76143014, + "learning_rate": 4.996851386512331e-07, + "loss": 0.78317571, + "num_input_tokens_seen": 139169130, + "step": 6461, + "time_per_iteration": 2.77785325050354 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01083525, + "balance_loss_clip": 1.02398503, + "balance_loss_mlp": 1.00336862, + "epoch": 0.7770095593098058, + "flos": 20704584908160.0, + "grad_norm": 1.7088980579551771, + "language_loss": 0.83135903, + "learning_rate": 4.991701502355444e-07, + "loss": 0.85334146, + "num_input_tokens_seen": 139189595, + "step": 6462, + "time_per_iteration": 2.773212194442749 + }, + { + "auxiliary_loss_clip": 0.01126156, + "auxiliary_loss_mlp": 0.01083687, + "balance_loss_clip": 1.02483988, + "balance_loss_mlp": 1.00362635, + "epoch": 0.7771298022004449, + "flos": 24717709877760.0, + "grad_norm": 1.6423007119429829, + "language_loss": 0.75908303, + "learning_rate": 4.986553894945518e-07, + "loss": 0.78118145, + "num_input_tokens_seen": 139210805, + "step": 6463, + "time_per_iteration": 2.7516815662384033 + }, + { + "auxiliary_loss_clip": 0.01081395, + "auxiliary_loss_mlp": 0.01083726, + "balance_loss_clip": 1.02330136, + "balance_loss_mlp": 1.00376034, + "epoch": 0.777250045091084, + "flos": 25009232659200.0, + "grad_norm": 1.8535333659158486, + "language_loss": 0.86077917, + "learning_rate": 4.981408565063416e-07, + "loss": 0.88243043, + "num_input_tokens_seen": 139230750, + "step": 6464, + "time_per_iteration": 2.7937073707580566 + }, + { + "auxiliary_loss_clip": 0.01134874, + "auxiliary_loss_mlp": 0.01084858, + "balance_loss_clip": 1.0254606, + "balance_loss_mlp": 1.00460625, + "epoch": 0.777370287981723, + "flos": 20119887319680.0, + "grad_norm": 1.7826758512488485, + "language_loss": 0.76083797, + "learning_rate": 4.976265513489701e-07, + "loss": 0.78303528, + "num_input_tokens_seen": 139250720, + "step": 6465, + "time_per_iteration": 2.6434972286224365 + }, + { + "auxiliary_loss_clip": 0.01128064, + "auxiliary_loss_mlp": 0.01083791, + "balance_loss_clip": 1.02646279, + "balance_loss_mlp": 1.00368249, + "epoch": 0.7774905308723622, + "flos": 21718809331200.0, + "grad_norm": 1.670351801888922, + "language_loss": 0.80187678, + "learning_rate": 4.971124741004562e-07, + "loss": 0.82399535, + "num_input_tokens_seen": 139269720, + "step": 6466, + "time_per_iteration": 2.6827898025512695 + }, + { + "auxiliary_loss_clip": 0.01124526, + "auxiliary_loss_mlp": 0.01083841, + "balance_loss_clip": 1.02459514, + "balance_loss_mlp": 1.00378025, + "epoch": 0.7776107737630013, + "flos": 16034115093120.0, + "grad_norm": 1.696227794866029, + "language_loss": 0.76043069, + "learning_rate": 4.965986248387846e-07, + "loss": 0.78251439, + "num_input_tokens_seen": 139288035, + "step": 6467, + "time_per_iteration": 2.7343411445617676 + }, + { + "auxiliary_loss_clip": 0.01116321, + "auxiliary_loss_mlp": 0.01083868, + "balance_loss_clip": 1.02376354, + "balance_loss_mlp": 1.00366402, + "epoch": 0.7777310166536403, + "flos": 24790895838720.0, + "grad_norm": 2.0010422181635286, + "language_loss": 0.77059424, + "learning_rate": 4.960850036419073e-07, + "loss": 0.7925961, + "num_input_tokens_seen": 139307135, + "step": 6468, + "time_per_iteration": 2.779848575592041 + }, + { + "auxiliary_loss_clip": 0.01112805, + "auxiliary_loss_mlp": 0.0108485, + "balance_loss_clip": 1.02247477, + "balance_loss_mlp": 1.00464606, + "epoch": 0.7778512595442795, + "flos": 17272530253440.0, + "grad_norm": 2.0433512475824704, + "language_loss": 0.7853936, + "learning_rate": 4.955716105877378e-07, + "loss": 0.80737019, + "num_input_tokens_seen": 139325905, + "step": 6469, + "time_per_iteration": 2.7143561840057373 + }, + { + "auxiliary_loss_clip": 0.01125983, + "auxiliary_loss_mlp": 0.00872899, + "balance_loss_clip": 1.02472866, + "balance_loss_mlp": 1.00010169, + "epoch": 0.7779715024349185, + "flos": 17748418567680.0, + "grad_norm": 1.6052615429793002, + "language_loss": 0.83046079, + "learning_rate": 4.950584457541598e-07, + "loss": 0.85044956, + "num_input_tokens_seen": 139344370, + "step": 6470, + "time_per_iteration": 3.569538116455078 + }, + { + "auxiliary_loss_clip": 0.01127069, + "auxiliary_loss_mlp": 0.01084701, + "balance_loss_clip": 1.02568793, + "balance_loss_mlp": 1.00464022, + "epoch": 0.7780917453255576, + "flos": 24316875031680.0, + "grad_norm": 1.307718043016706, + "language_loss": 0.82005882, + "learning_rate": 4.945455092190183e-07, + "loss": 0.84217656, + "num_input_tokens_seen": 139365625, + "step": 6471, + "time_per_iteration": 2.738664150238037 + }, + { + "auxiliary_loss_clip": 0.01115509, + "auxiliary_loss_mlp": 0.01078946, + "balance_loss_clip": 1.02025461, + "balance_loss_mlp": 0.9999817, + "epoch": 0.7782119882161967, + "flos": 56364601530240.0, + "grad_norm": 0.6824996174199143, + "language_loss": 0.55995619, + "learning_rate": 4.940328010601271e-07, + "loss": 0.58190072, + "num_input_tokens_seen": 139430540, + "step": 6472, + "time_per_iteration": 4.23348331451416 + }, + { + "auxiliary_loss_clip": 0.01102952, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_clip": 1.02760601, + "balance_loss_mlp": 1.00538826, + "epoch": 0.7783322311068358, + "flos": 46789986994560.0, + "grad_norm": 1.7632928286675082, + "language_loss": 0.76834649, + "learning_rate": 4.935203213552621e-07, + "loss": 0.79023147, + "num_input_tokens_seen": 139454280, + "step": 6473, + "time_per_iteration": 3.8373818397521973 + }, + { + "auxiliary_loss_clip": 0.01112328, + "auxiliary_loss_mlp": 0.01084214, + "balance_loss_clip": 1.02187419, + "balance_loss_mlp": 1.00405788, + "epoch": 0.7784524739974749, + "flos": 19057864872960.0, + "grad_norm": 2.0214089422399404, + "language_loss": 0.67023706, + "learning_rate": 4.930080701821662e-07, + "loss": 0.69220251, + "num_input_tokens_seen": 139471745, + "step": 6474, + "time_per_iteration": 2.712014675140381 + }, + { + "auxiliary_loss_clip": 0.01116279, + "auxiliary_loss_mlp": 0.01084095, + "balance_loss_clip": 1.02357376, + "balance_loss_mlp": 1.00393856, + "epoch": 0.778572716888114, + "flos": 24791111320320.0, + "grad_norm": 2.518839861341108, + "language_loss": 0.77261174, + "learning_rate": 4.92496047618548e-07, + "loss": 0.79461539, + "num_input_tokens_seen": 139491505, + "step": 6475, + "time_per_iteration": 2.7848832607269287 + }, + { + "auxiliary_loss_clip": 0.01125976, + "auxiliary_loss_mlp": 0.01082916, + "balance_loss_clip": 1.02596486, + "balance_loss_mlp": 1.00280738, + "epoch": 0.7786929597787531, + "flos": 20078086867200.0, + "grad_norm": 1.9329019197476678, + "language_loss": 0.77932429, + "learning_rate": 4.919842537420811e-07, + "loss": 0.80141324, + "num_input_tokens_seen": 139508620, + "step": 6476, + "time_per_iteration": 2.614778995513916 + }, + { + "auxiliary_loss_clip": 0.0111763, + "auxiliary_loss_mlp": 0.01083957, + "balance_loss_clip": 1.02618742, + "balance_loss_mlp": 1.00380087, + "epoch": 0.7788132026693921, + "flos": 21872220318720.0, + "grad_norm": 1.9323896433814618, + "language_loss": 0.79401791, + "learning_rate": 4.91472688630404e-07, + "loss": 0.81603378, + "num_input_tokens_seen": 139529360, + "step": 6477, + "time_per_iteration": 3.7031474113464355 + }, + { + "auxiliary_loss_clip": 0.01135071, + "auxiliary_loss_mlp": 0.0108441, + "balance_loss_clip": 1.02611792, + "balance_loss_mlp": 1.00430155, + "epoch": 0.7789334455600313, + "flos": 11181937351680.0, + "grad_norm": 1.7398391355237692, + "language_loss": 0.74201387, + "learning_rate": 4.909613523611202e-07, + "loss": 0.76420867, + "num_input_tokens_seen": 139546240, + "step": 6478, + "time_per_iteration": 2.564915657043457 + }, + { + "auxiliary_loss_clip": 0.01084106, + "auxiliary_loss_mlp": 0.00872954, + "balance_loss_clip": 1.02471542, + "balance_loss_mlp": 1.00011563, + "epoch": 0.7790536884506704, + "flos": 28695427015680.0, + "grad_norm": 1.8668821394920323, + "language_loss": 0.74474329, + "learning_rate": 4.904502450117991e-07, + "loss": 0.76431388, + "num_input_tokens_seen": 139567200, + "step": 6479, + "time_per_iteration": 2.876951217651367 + }, + { + "auxiliary_loss_clip": 0.01109619, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_clip": 1.0214144, + "balance_loss_mlp": 1.00361907, + "epoch": 0.7791739313413094, + "flos": 11072302064640.0, + "grad_norm": 2.116546643885583, + "language_loss": 0.7279985, + "learning_rate": 4.899393666599762e-07, + "loss": 0.74993193, + "num_input_tokens_seen": 139583775, + "step": 6480, + "time_per_iteration": 2.720874071121216 + }, + { + "auxiliary_loss_clip": 0.01134045, + "auxiliary_loss_mlp": 0.01083687, + "balance_loss_clip": 1.02471137, + "balance_loss_mlp": 1.00353062, + "epoch": 0.7792941742319486, + "flos": 14679276975360.0, + "grad_norm": 2.3144351169655284, + "language_loss": 0.72739327, + "learning_rate": 4.894287173831506e-07, + "loss": 0.74957061, + "num_input_tokens_seen": 139599735, + "step": 6481, + "time_per_iteration": 2.630201816558838 + }, + { + "auxiliary_loss_clip": 0.01115683, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_clip": 1.02361763, + "balance_loss_mlp": 1.00278926, + "epoch": 0.7794144171225876, + "flos": 23258874908160.0, + "grad_norm": 2.1859078256702666, + "language_loss": 0.84317744, + "learning_rate": 4.889182972587877e-07, + "loss": 0.86516333, + "num_input_tokens_seen": 139619030, + "step": 6482, + "time_per_iteration": 2.7563390731811523 + }, + { + "auxiliary_loss_clip": 0.01111922, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_clip": 1.02747929, + "balance_loss_mlp": 1.00511777, + "epoch": 0.7795346600132267, + "flos": 21507080613120.0, + "grad_norm": 1.8208585778533724, + "language_loss": 0.66247559, + "learning_rate": 4.884081063643177e-07, + "loss": 0.68444747, + "num_input_tokens_seen": 139637690, + "step": 6483, + "time_per_iteration": 2.7061331272125244 + }, + { + "auxiliary_loss_clip": 0.01093589, + "auxiliary_loss_mlp": 0.0107906, + "balance_loss_clip": 1.01563871, + "balance_loss_mlp": 1.00009573, + "epoch": 0.7796549029038659, + "flos": 70052273694720.0, + "grad_norm": 0.8758466745578823, + "language_loss": 0.52532673, + "learning_rate": 4.878981447771353e-07, + "loss": 0.54705316, + "num_input_tokens_seen": 139692070, + "step": 6484, + "time_per_iteration": 3.325582504272461 + }, + { + "auxiliary_loss_clip": 0.01106855, + "auxiliary_loss_mlp": 0.01083424, + "balance_loss_clip": 1.02370489, + "balance_loss_mlp": 1.00322032, + "epoch": 0.7797751457945049, + "flos": 23989405714560.0, + "grad_norm": 1.8046939072394217, + "language_loss": 0.732301, + "learning_rate": 4.873884125746035e-07, + "loss": 0.7542038, + "num_input_tokens_seen": 139713745, + "step": 6485, + "time_per_iteration": 2.7736966609954834 + }, + { + "auxiliary_loss_clip": 0.01116385, + "auxiliary_loss_mlp": 0.01083598, + "balance_loss_clip": 1.02448273, + "balance_loss_mlp": 1.00348902, + "epoch": 0.779895388685144, + "flos": 22674751937280.0, + "grad_norm": 2.2138908119487706, + "language_loss": 0.72717828, + "learning_rate": 4.868789098340456e-07, + "loss": 0.74917817, + "num_input_tokens_seen": 139731650, + "step": 6486, + "time_per_iteration": 2.7951316833496094 + }, + { + "auxiliary_loss_clip": 0.01106477, + "auxiliary_loss_mlp": 0.01083734, + "balance_loss_clip": 1.02367926, + "balance_loss_mlp": 1.00367284, + "epoch": 0.7800156315757831, + "flos": 23768698596480.0, + "grad_norm": 2.699948216137916, + "language_loss": 0.73605478, + "learning_rate": 4.863696366327543e-07, + "loss": 0.75795686, + "num_input_tokens_seen": 139750820, + "step": 6487, + "time_per_iteration": 2.809246063232422 + }, + { + "auxiliary_loss_clip": 0.01125636, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_clip": 1.02448106, + "balance_loss_mlp": 1.00417447, + "epoch": 0.7801358744664222, + "flos": 26429714881920.0, + "grad_norm": 1.6718767020172471, + "language_loss": 0.77984375, + "learning_rate": 4.85860593047986e-07, + "loss": 0.80194294, + "num_input_tokens_seen": 139770885, + "step": 6488, + "time_per_iteration": 2.694589138031006 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01083127, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.00301862, + "epoch": 0.7802561173570612, + "flos": 26322162583680.0, + "grad_norm": 1.5941507901813783, + "language_loss": 0.74635327, + "learning_rate": 4.853517791569613e-07, + "loss": 0.7682566, + "num_input_tokens_seen": 139793065, + "step": 6489, + "time_per_iteration": 2.8645517826080322 + }, + { + "auxiliary_loss_clip": 0.01115599, + "auxiliary_loss_mlp": 0.00872959, + "balance_loss_clip": 1.02312732, + "balance_loss_mlp": 1.00006735, + "epoch": 0.7803763602477004, + "flos": 40333751596800.0, + "grad_norm": 1.6021685775862642, + "language_loss": 0.66070467, + "learning_rate": 4.848431950368684e-07, + "loss": 0.68059027, + "num_input_tokens_seen": 139815625, + "step": 6490, + "time_per_iteration": 2.960906505584717 + }, + { + "auxiliary_loss_clip": 0.01115137, + "auxiliary_loss_mlp": 0.00872926, + "balance_loss_clip": 1.01988983, + "balance_loss_mlp": 1.00127113, + "epoch": 0.7804966031383395, + "flos": 67001448038400.0, + "grad_norm": 0.7059091741782897, + "language_loss": 0.55831468, + "learning_rate": 4.843348407648569e-07, + "loss": 0.57819533, + "num_input_tokens_seen": 139876905, + "step": 6491, + "time_per_iteration": 3.20383620262146 + }, + { + "auxiliary_loss_clip": 0.01124943, + "auxiliary_loss_mlp": 0.01083188, + "balance_loss_clip": 1.02315569, + "balance_loss_mlp": 1.00298417, + "epoch": 0.7806168460289785, + "flos": 17740733057280.0, + "grad_norm": 2.073403898878156, + "language_loss": 0.83236021, + "learning_rate": 4.838267164180457e-07, + "loss": 0.85444152, + "num_input_tokens_seen": 139892575, + "step": 6492, + "time_per_iteration": 2.618985891342163 + }, + { + "auxiliary_loss_clip": 0.01136252, + "auxiliary_loss_mlp": 0.01084482, + "balance_loss_clip": 1.02636027, + "balance_loss_mlp": 1.00432622, + "epoch": 0.7807370889196176, + "flos": 23946240545280.0, + "grad_norm": 1.8178799670576011, + "language_loss": 0.83729577, + "learning_rate": 4.833188220735156e-07, + "loss": 0.85950315, + "num_input_tokens_seen": 139912245, + "step": 6493, + "time_per_iteration": 2.8966917991638184 + }, + { + "auxiliary_loss_clip": 0.01124323, + "auxiliary_loss_mlp": 0.010844, + "balance_loss_clip": 1.023857, + "balance_loss_mlp": 1.00429189, + "epoch": 0.7808573318102567, + "flos": 18989024457600.0, + "grad_norm": 1.9582056510841797, + "language_loss": 0.74870247, + "learning_rate": 4.828111578083152e-07, + "loss": 0.77078974, + "num_input_tokens_seen": 139929150, + "step": 6494, + "time_per_iteration": 2.719937562942505 + }, + { + "auxiliary_loss_clip": 0.01115145, + "auxiliary_loss_mlp": 0.01083659, + "balance_loss_clip": 1.02446032, + "balance_loss_mlp": 1.00350261, + "epoch": 0.7809775747008958, + "flos": 23980750536960.0, + "grad_norm": 2.014685423803451, + "language_loss": 0.81203854, + "learning_rate": 4.823037236994556e-07, + "loss": 0.83402658, + "num_input_tokens_seen": 139947315, + "step": 6495, + "time_per_iteration": 2.6878960132598877 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.0107906, + "balance_loss_clip": 1.01990449, + "balance_loss_mlp": 1.00009561, + "epoch": 0.7810978175915348, + "flos": 68535875180160.0, + "grad_norm": 0.7113478695419618, + "language_loss": 0.56367326, + "learning_rate": 4.817965198239136e-07, + "loss": 0.58553588, + "num_input_tokens_seen": 140013775, + "step": 6496, + "time_per_iteration": 4.10222315788269 + }, + { + "auxiliary_loss_clip": 0.0110747, + "auxiliary_loss_mlp": 0.01083443, + "balance_loss_clip": 1.02341223, + "balance_loss_mlp": 1.00333393, + "epoch": 0.781218060482174, + "flos": 19642131498240.0, + "grad_norm": 2.21254793235747, + "language_loss": 0.74334502, + "learning_rate": 4.812895462586331e-07, + "loss": 0.76525414, + "num_input_tokens_seen": 140031600, + "step": 6497, + "time_per_iteration": 2.728769540786743 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01084379, + "balance_loss_clip": 1.02356696, + "balance_loss_mlp": 1.00436568, + "epoch": 0.7813383033728131, + "flos": 25627865621760.0, + "grad_norm": 1.6633557946599054, + "language_loss": 0.81906056, + "learning_rate": 4.807828030805207e-07, + "loss": 0.84097403, + "num_input_tokens_seen": 140050590, + "step": 6498, + "time_per_iteration": 4.5928380489349365 + }, + { + "auxiliary_loss_clip": 0.01119781, + "auxiliary_loss_mlp": 0.01084609, + "balance_loss_clip": 1.02107823, + "balance_loss_mlp": 1.00449991, + "epoch": 0.7814585462634521, + "flos": 20485924865280.0, + "grad_norm": 1.7683132155351189, + "language_loss": 0.67937219, + "learning_rate": 4.802762903664495e-07, + "loss": 0.70141613, + "num_input_tokens_seen": 140069770, + "step": 6499, + "time_per_iteration": 2.7195310592651367 + }, + { + "auxiliary_loss_clip": 0.01117294, + "auxiliary_loss_mlp": 0.01085764, + "balance_loss_clip": 1.02471995, + "balance_loss_mlp": 1.00546467, + "epoch": 0.7815787891540913, + "flos": 22304297018880.0, + "grad_norm": 2.201215914240378, + "language_loss": 0.73767334, + "learning_rate": 4.797700081932565e-07, + "loss": 0.75970387, + "num_input_tokens_seen": 140087635, + "step": 6500, + "time_per_iteration": 2.6953680515289307 + }, + { + "auxiliary_loss_clip": 0.0108959, + "auxiliary_loss_mlp": 0.01084912, + "balance_loss_clip": 1.02362609, + "balance_loss_mlp": 1.00485158, + "epoch": 0.7816990320447303, + "flos": 22600668136320.0, + "grad_norm": 2.036467359674591, + "language_loss": 0.81444079, + "learning_rate": 4.792639566377442e-07, + "loss": 0.83618581, + "num_input_tokens_seen": 140105045, + "step": 6501, + "time_per_iteration": 2.9020910263061523 + }, + { + "auxiliary_loss_clip": 0.0112663, + "auxiliary_loss_mlp": 0.01083591, + "balance_loss_clip": 1.02524328, + "balance_loss_mlp": 1.00357747, + "epoch": 0.7818192749353694, + "flos": 24935974871040.0, + "grad_norm": 1.739191303916695, + "language_loss": 0.77424562, + "learning_rate": 4.78758135776681e-07, + "loss": 0.79634786, + "num_input_tokens_seen": 140124900, + "step": 6502, + "time_per_iteration": 3.706501007080078 + }, + { + "auxiliary_loss_clip": 0.01114001, + "auxiliary_loss_mlp": 0.01084796, + "balance_loss_clip": 1.0229063, + "balance_loss_mlp": 1.00468755, + "epoch": 0.7819395178260086, + "flos": 23733039369600.0, + "grad_norm": 2.6130596877165337, + "language_loss": 0.78822154, + "learning_rate": 4.782525456867989e-07, + "loss": 0.81020951, + "num_input_tokens_seen": 140143755, + "step": 6503, + "time_per_iteration": 2.724954128265381 + }, + { + "auxiliary_loss_clip": 0.01089881, + "auxiliary_loss_mlp": 0.01084707, + "balance_loss_clip": 1.02450836, + "balance_loss_mlp": 1.00440729, + "epoch": 0.7820597607166476, + "flos": 23221671396480.0, + "grad_norm": 1.579322965806559, + "language_loss": 0.8324306, + "learning_rate": 4.777471864447959e-07, + "loss": 0.85417646, + "num_input_tokens_seen": 140164495, + "step": 6504, + "time_per_iteration": 2.8407840728759766 + }, + { + "auxiliary_loss_clip": 0.01119076, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_clip": 1.02501738, + "balance_loss_mlp": 1.00430393, + "epoch": 0.7821800036072867, + "flos": 22309540404480.0, + "grad_norm": 1.9005375317428155, + "language_loss": 0.80467582, + "learning_rate": 4.772420581273344e-07, + "loss": 0.82671022, + "num_input_tokens_seen": 140181980, + "step": 6505, + "time_per_iteration": 2.683182954788208 + }, + { + "auxiliary_loss_clip": 0.01119594, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_clip": 1.02525139, + "balance_loss_mlp": 1.00457406, + "epoch": 0.7823002464979258, + "flos": 21544176384000.0, + "grad_norm": 2.651374586999658, + "language_loss": 0.76217735, + "learning_rate": 4.7673716081104134e-07, + "loss": 0.78422004, + "num_input_tokens_seen": 140202155, + "step": 6506, + "time_per_iteration": 2.6447339057922363 + }, + { + "auxiliary_loss_clip": 0.01125983, + "auxiliary_loss_mlp": 0.01084327, + "balance_loss_clip": 1.02514076, + "balance_loss_mlp": 1.0042181, + "epoch": 0.7824204893885649, + "flos": 24535642815360.0, + "grad_norm": 1.785374227017233, + "language_loss": 0.84196615, + "learning_rate": 4.762324945725109e-07, + "loss": 0.86406922, + "num_input_tokens_seen": 140221600, + "step": 6507, + "time_per_iteration": 2.7192554473876953 + }, + { + "auxiliary_loss_clip": 0.01108672, + "auxiliary_loss_mlp": 0.01085402, + "balance_loss_clip": 1.02397692, + "balance_loss_mlp": 1.00534141, + "epoch": 0.782540732279204, + "flos": 27415211402880.0, + "grad_norm": 1.630045082714461, + "language_loss": 0.75764042, + "learning_rate": 4.7572805948829844e-07, + "loss": 0.77958119, + "num_input_tokens_seen": 140241860, + "step": 6508, + "time_per_iteration": 2.7693421840667725 + }, + { + "auxiliary_loss_clip": 0.01096949, + "auxiliary_loss_mlp": 0.01083258, + "balance_loss_clip": 1.02185893, + "balance_loss_mlp": 1.00314915, + "epoch": 0.7826609751698431, + "flos": 24353216616960.0, + "grad_norm": 1.6504372643419412, + "language_loss": 0.70884091, + "learning_rate": 4.7522385563492795e-07, + "loss": 0.73064297, + "num_input_tokens_seen": 140262160, + "step": 6509, + "time_per_iteration": 2.828585386276245 + }, + { + "auxiliary_loss_clip": 0.01108863, + "auxiliary_loss_mlp": 0.01084105, + "balance_loss_clip": 1.02459717, + "balance_loss_mlp": 1.0039959, + "epoch": 0.7827812180604822, + "flos": 23988543788160.0, + "grad_norm": 2.210085008135032, + "language_loss": 0.70269048, + "learning_rate": 4.747198830888863e-07, + "loss": 0.72462022, + "num_input_tokens_seen": 140282030, + "step": 6510, + "time_per_iteration": 2.7468152046203613 + }, + { + "auxiliary_loss_clip": 0.01117458, + "auxiliary_loss_mlp": 0.01084876, + "balance_loss_clip": 1.02524877, + "balance_loss_mlp": 1.00476778, + "epoch": 0.7829014609511212, + "flos": 27454318335360.0, + "grad_norm": 2.4115943412259044, + "language_loss": 0.68617153, + "learning_rate": 4.742161419266251e-07, + "loss": 0.70819485, + "num_input_tokens_seen": 140301190, + "step": 6511, + "time_per_iteration": 2.8614389896392822 + }, + { + "auxiliary_loss_clip": 0.01127148, + "auxiliary_loss_mlp": 0.01085475, + "balance_loss_clip": 1.02626741, + "balance_loss_mlp": 1.0052712, + "epoch": 0.7830217038417604, + "flos": 29204532432000.0, + "grad_norm": 2.680660586637645, + "language_loss": 0.64926308, + "learning_rate": 4.7371263222456304e-07, + "loss": 0.67138934, + "num_input_tokens_seen": 140318510, + "step": 6512, + "time_per_iteration": 2.747394323348999 + }, + { + "auxiliary_loss_clip": 0.01105581, + "auxiliary_loss_mlp": 0.01078847, + "balance_loss_clip": 1.01879334, + "balance_loss_mlp": 0.9998827, + "epoch": 0.7831419467323995, + "flos": 60950895822720.0, + "grad_norm": 0.7980141436622044, + "language_loss": 0.61439288, + "learning_rate": 4.7320935405908004e-07, + "loss": 0.63623714, + "num_input_tokens_seen": 140379380, + "step": 6513, + "time_per_iteration": 3.289121389389038 + }, + { + "auxiliary_loss_clip": 0.01136125, + "auxiliary_loss_mlp": 0.0108477, + "balance_loss_clip": 1.02640808, + "balance_loss_mlp": 1.00456619, + "epoch": 0.7832621896230385, + "flos": 19682531320320.0, + "grad_norm": 2.3951682391879707, + "language_loss": 0.84222353, + "learning_rate": 4.7270630750652475e-07, + "loss": 0.86443245, + "num_input_tokens_seen": 140395335, + "step": 6514, + "time_per_iteration": 2.61291766166687 + }, + { + "auxiliary_loss_clip": 0.01124484, + "auxiliary_loss_mlp": 0.0108319, + "balance_loss_clip": 1.02419806, + "balance_loss_mlp": 1.0030818, + "epoch": 0.7833824325136777, + "flos": 25009232659200.0, + "grad_norm": 1.7308143098046769, + "language_loss": 0.80363798, + "learning_rate": 4.7220349264320746e-07, + "loss": 0.82571471, + "num_input_tokens_seen": 140414420, + "step": 6515, + "time_per_iteration": 2.7278335094451904 + }, + { + "auxiliary_loss_clip": 0.01107518, + "auxiliary_loss_mlp": 0.01079125, + "balance_loss_clip": 1.02048421, + "balance_loss_mlp": 1.00016105, + "epoch": 0.7835026754043167, + "flos": 68800142517120.0, + "grad_norm": 0.7527285841928989, + "language_loss": 0.54958004, + "learning_rate": 4.71700909545407e-07, + "loss": 0.57144642, + "num_input_tokens_seen": 140477365, + "step": 6516, + "time_per_iteration": 3.2491135597229004 + }, + { + "auxiliary_loss_clip": 0.01127215, + "auxiliary_loss_mlp": 0.01083288, + "balance_loss_clip": 1.02610612, + "balance_loss_mlp": 1.00317907, + "epoch": 0.7836229182949558, + "flos": 19864598382720.0, + "grad_norm": 2.8883367283106707, + "language_loss": 0.76889884, + "learning_rate": 4.711985582893627e-07, + "loss": 0.79100394, + "num_input_tokens_seen": 140495885, + "step": 6517, + "time_per_iteration": 2.6797943115234375 + }, + { + "auxiliary_loss_clip": 0.01097544, + "auxiliary_loss_mlp": 0.01083571, + "balance_loss_clip": 1.02254176, + "balance_loss_mlp": 1.00341451, + "epoch": 0.783743161185595, + "flos": 22965843755520.0, + "grad_norm": 1.6747217155314766, + "language_loss": 0.7150203, + "learning_rate": 4.706964389512811e-07, + "loss": 0.73683143, + "num_input_tokens_seen": 140515920, + "step": 6518, + "time_per_iteration": 2.856980323791504 + }, + { + "auxiliary_loss_clip": 0.01134585, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_clip": 1.02574074, + "balance_loss_mlp": 1.00456274, + "epoch": 0.783863404076234, + "flos": 12458489777280.0, + "grad_norm": 1.8365124341173167, + "language_loss": 0.87219727, + "learning_rate": 4.701945516073345e-07, + "loss": 0.89439029, + "num_input_tokens_seen": 140533395, + "step": 6519, + "time_per_iteration": 2.5899949073791504 + }, + { + "auxiliary_loss_clip": 0.01103797, + "auxiliary_loss_mlp": 0.01083472, + "balance_loss_clip": 1.02214587, + "balance_loss_mlp": 1.00336313, + "epoch": 0.7839836469668731, + "flos": 24243940465920.0, + "grad_norm": 1.8959885006952732, + "language_loss": 0.74861807, + "learning_rate": 4.696928963336577e-07, + "loss": 0.77049077, + "num_input_tokens_seen": 140552825, + "step": 6520, + "time_per_iteration": 2.7962937355041504 + }, + { + "auxiliary_loss_clip": 0.01105778, + "auxiliary_loss_mlp": 0.01078883, + "balance_loss_clip": 1.0189954, + "balance_loss_mlp": 0.99991912, + "epoch": 0.7841038898575122, + "flos": 62121978938880.0, + "grad_norm": 0.853319288220648, + "language_loss": 0.61048782, + "learning_rate": 4.6919147320635224e-07, + "loss": 0.63233447, + "num_input_tokens_seen": 140615535, + "step": 6521, + "time_per_iteration": 3.1887407302856445 + }, + { + "auxiliary_loss_clip": 0.0112726, + "auxiliary_loss_mlp": 0.01083605, + "balance_loss_clip": 1.02583289, + "balance_loss_mlp": 1.00349641, + "epoch": 0.7842241327481513, + "flos": 20193899293440.0, + "grad_norm": 2.0746617242404124, + "language_loss": 0.72488642, + "learning_rate": 4.6869028230148286e-07, + "loss": 0.74699509, + "num_input_tokens_seen": 140633330, + "step": 6522, + "time_per_iteration": 3.536336898803711 + }, + { + "auxiliary_loss_clip": 0.01107342, + "auxiliary_loss_mlp": 0.01084742, + "balance_loss_clip": 1.02316463, + "balance_loss_mlp": 1.0044899, + "epoch": 0.7843443756387903, + "flos": 28074531496320.0, + "grad_norm": 2.375701515644629, + "language_loss": 0.59939539, + "learning_rate": 4.6818932369507957e-07, + "loss": 0.62131619, + "num_input_tokens_seen": 140652830, + "step": 6523, + "time_per_iteration": 2.779677391052246 + }, + { + "auxiliary_loss_clip": 0.01125127, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_clip": 1.02560568, + "balance_loss_mlp": 1.0038631, + "epoch": 0.7844646185294295, + "flos": 21323397438720.0, + "grad_norm": 3.1694729397287746, + "language_loss": 0.88931102, + "learning_rate": 4.676885974631386e-07, + "loss": 0.91140151, + "num_input_tokens_seen": 140671190, + "step": 6524, + "time_per_iteration": 4.514974594116211 + }, + { + "auxiliary_loss_clip": 0.01125605, + "auxiliary_loss_mlp": 0.01083963, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.00390172, + "epoch": 0.7845848614200686, + "flos": 23656585271040.0, + "grad_norm": 2.119064737624798, + "language_loss": 0.81107974, + "learning_rate": 4.67188103681619e-07, + "loss": 0.83317554, + "num_input_tokens_seen": 140690975, + "step": 6525, + "time_per_iteration": 2.6696348190307617 + }, + { + "auxiliary_loss_clip": 0.01119965, + "auxiliary_loss_mlp": 0.00872892, + "balance_loss_clip": 1.0217663, + "balance_loss_mlp": 1.00004387, + "epoch": 0.7847051043107076, + "flos": 23402194174080.0, + "grad_norm": 2.1463895490307747, + "language_loss": 0.69101501, + "learning_rate": 4.666878424264453e-07, + "loss": 0.71094358, + "num_input_tokens_seen": 140710930, + "step": 6526, + "time_per_iteration": 2.731706380844116 + }, + { + "auxiliary_loss_clip": 0.01119203, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_clip": 1.02685893, + "balance_loss_mlp": 1.00379634, + "epoch": 0.7848253472013467, + "flos": 19022277473280.0, + "grad_norm": 1.6591820509408666, + "language_loss": 0.73903531, + "learning_rate": 4.661878137735069e-07, + "loss": 0.76106447, + "num_input_tokens_seen": 140729120, + "step": 6527, + "time_per_iteration": 2.721691370010376 + }, + { + "auxiliary_loss_clip": 0.01115615, + "auxiliary_loss_mlp": 0.01084353, + "balance_loss_clip": 1.02429819, + "balance_loss_mlp": 1.00433969, + "epoch": 0.7849455900919858, + "flos": 21179180332800.0, + "grad_norm": 2.730533947932729, + "language_loss": 0.74883568, + "learning_rate": 4.656880177986571e-07, + "loss": 0.7708354, + "num_input_tokens_seen": 140747665, + "step": 6528, + "time_per_iteration": 3.673219919204712 + }, + { + "auxiliary_loss_clip": 0.0111701, + "auxiliary_loss_mlp": 0.01084229, + "balance_loss_clip": 1.02401924, + "balance_loss_mlp": 1.00402451, + "epoch": 0.7850658329826249, + "flos": 19536482620800.0, + "grad_norm": 1.7609882639558665, + "language_loss": 0.81390107, + "learning_rate": 4.6518845457771607e-07, + "loss": 0.83591342, + "num_input_tokens_seen": 140766525, + "step": 6529, + "time_per_iteration": 2.749181032180786 + }, + { + "auxiliary_loss_clip": 0.01128514, + "auxiliary_loss_mlp": 0.00872753, + "balance_loss_clip": 1.02708673, + "balance_loss_mlp": 1.0000844, + "epoch": 0.7851860758732639, + "flos": 12495334152960.0, + "grad_norm": 1.7968659419609632, + "language_loss": 0.78451973, + "learning_rate": 4.646891241864652e-07, + "loss": 0.80453241, + "num_input_tokens_seen": 140785090, + "step": 6530, + "time_per_iteration": 2.6445369720458984 + }, + { + "auxiliary_loss_clip": 0.01125733, + "auxiliary_loss_mlp": 0.01084035, + "balance_loss_clip": 1.02389514, + "balance_loss_mlp": 1.00378358, + "epoch": 0.7853063187639031, + "flos": 22960959505920.0, + "grad_norm": 1.7848408117147547, + "language_loss": 0.73065543, + "learning_rate": 4.6419002670065397e-07, + "loss": 0.75275314, + "num_input_tokens_seen": 140804670, + "step": 6531, + "time_per_iteration": 2.743536949157715 + }, + { + "auxiliary_loss_clip": 0.011057, + "auxiliary_loss_mlp": 0.01085569, + "balance_loss_clip": 1.02339876, + "balance_loss_mlp": 1.00536513, + "epoch": 0.7854265616545422, + "flos": 17347260499200.0, + "grad_norm": 1.932666257742805, + "language_loss": 0.86879134, + "learning_rate": 4.6369116219599445e-07, + "loss": 0.89070392, + "num_input_tokens_seen": 140820655, + "step": 6532, + "time_per_iteration": 2.7830936908721924 + }, + { + "auxiliary_loss_clip": 0.01106651, + "auxiliary_loss_mlp": 0.01083732, + "balance_loss_clip": 1.02329659, + "balance_loss_mlp": 1.00367117, + "epoch": 0.7855468045451812, + "flos": 23838293197440.0, + "grad_norm": 1.658292587426606, + "language_loss": 0.79297042, + "learning_rate": 4.631925307481637e-07, + "loss": 0.81487423, + "num_input_tokens_seen": 140840470, + "step": 6533, + "time_per_iteration": 2.7773585319519043 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01084016, + "balance_loss_clip": 1.0244956, + "balance_loss_mlp": 1.00400257, + "epoch": 0.7856670474358204, + "flos": 25666792986240.0, + "grad_norm": 2.0132337047847724, + "language_loss": 0.75584662, + "learning_rate": 4.6269413243280533e-07, + "loss": 0.77784467, + "num_input_tokens_seen": 140859890, + "step": 6534, + "time_per_iteration": 2.756743907928467 + }, + { + "auxiliary_loss_clip": 0.0111297, + "auxiliary_loss_mlp": 0.01084103, + "balance_loss_clip": 1.02266121, + "balance_loss_mlp": 1.00389886, + "epoch": 0.7857872903264594, + "flos": 18144656472960.0, + "grad_norm": 2.4643388551390593, + "language_loss": 0.74378324, + "learning_rate": 4.621959673255236e-07, + "loss": 0.76575398, + "num_input_tokens_seen": 140876190, + "step": 6535, + "time_per_iteration": 2.806572437286377 + }, + { + "auxiliary_loss_clip": 0.01093203, + "auxiliary_loss_mlp": 0.01083649, + "balance_loss_clip": 1.02456284, + "balance_loss_mlp": 1.0035404, + "epoch": 0.7859075332170985, + "flos": 14386138081920.0, + "grad_norm": 8.736107792963896, + "language_loss": 0.90498209, + "learning_rate": 4.6169803550189135e-07, + "loss": 0.92675054, + "num_input_tokens_seen": 140891885, + "step": 6536, + "time_per_iteration": 2.7381410598754883 + }, + { + "auxiliary_loss_clip": 0.0108659, + "auxiliary_loss_mlp": 0.01083613, + "balance_loss_clip": 1.02072644, + "balance_loss_mlp": 1.00336099, + "epoch": 0.7860277761077377, + "flos": 19864059678720.0, + "grad_norm": 1.8875014417293963, + "language_loss": 0.77465856, + "learning_rate": 4.6120033703744355e-07, + "loss": 0.79636061, + "num_input_tokens_seen": 140910780, + "step": 6537, + "time_per_iteration": 2.8747897148132324 + }, + { + "auxiliary_loss_clip": 0.01117286, + "auxiliary_loss_mlp": 0.0108393, + "balance_loss_clip": 1.02559447, + "balance_loss_mlp": 1.00386953, + "epoch": 0.7861480189983767, + "flos": 26396174557440.0, + "grad_norm": 1.807318016893254, + "language_loss": 0.78356802, + "learning_rate": 4.607028720076822e-07, + "loss": 0.80558026, + "num_input_tokens_seen": 140927460, + "step": 6538, + "time_per_iteration": 2.7781524658203125 + }, + { + "auxiliary_loss_clip": 0.01126243, + "auxiliary_loss_mlp": 0.01084201, + "balance_loss_clip": 1.02607608, + "balance_loss_mlp": 1.00409245, + "epoch": 0.7862682618890158, + "flos": 24236578177920.0, + "grad_norm": 1.8809939095877306, + "language_loss": 0.73156607, + "learning_rate": 4.6020564048807074e-07, + "loss": 0.75367057, + "num_input_tokens_seen": 140945135, + "step": 6539, + "time_per_iteration": 2.6711928844451904 + }, + { + "auxiliary_loss_clip": 0.01126776, + "auxiliary_loss_mlp": 0.01084067, + "balance_loss_clip": 1.02592254, + "balance_loss_mlp": 1.00395811, + "epoch": 0.7863885047796549, + "flos": 47551508259840.0, + "grad_norm": 2.5537239523755826, + "language_loss": 0.71770769, + "learning_rate": 4.5970864255403883e-07, + "loss": 0.73981607, + "num_input_tokens_seen": 140966660, + "step": 6540, + "time_per_iteration": 2.936739206314087 + }, + { + "auxiliary_loss_clip": 0.0112646, + "auxiliary_loss_mlp": 0.0108381, + "balance_loss_clip": 1.02573514, + "balance_loss_mlp": 1.00379622, + "epoch": 0.786508747670294, + "flos": 24389234979840.0, + "grad_norm": 1.7417823795560041, + "language_loss": 0.82200968, + "learning_rate": 4.59211878280982e-07, + "loss": 0.8441124, + "num_input_tokens_seen": 140986175, + "step": 6541, + "time_per_iteration": 2.6951370239257812 + }, + { + "auxiliary_loss_clip": 0.0111688, + "auxiliary_loss_mlp": 0.01082983, + "balance_loss_clip": 1.02486753, + "balance_loss_mlp": 1.00277865, + "epoch": 0.786628990560933, + "flos": 18041234238720.0, + "grad_norm": 2.1392243487859557, + "language_loss": 0.69600421, + "learning_rate": 4.587153477442578e-07, + "loss": 0.7180028, + "num_input_tokens_seen": 141002490, + "step": 6542, + "time_per_iteration": 2.7261486053466797 + }, + { + "auxiliary_loss_clip": 0.01135621, + "auxiliary_loss_mlp": 0.0108493, + "balance_loss_clip": 1.02602232, + "balance_loss_mlp": 1.00467813, + "epoch": 0.7867492334515722, + "flos": 25848860048640.0, + "grad_norm": 2.323540349389286, + "language_loss": 0.81299144, + "learning_rate": 4.582190510191899e-07, + "loss": 0.83519691, + "num_input_tokens_seen": 141021150, + "step": 6543, + "time_per_iteration": 2.6811254024505615 + }, + { + "auxiliary_loss_clip": 0.01105447, + "auxiliary_loss_mlp": 0.01084387, + "balance_loss_clip": 1.02355039, + "balance_loss_mlp": 1.00432658, + "epoch": 0.7868694763422113, + "flos": 16580819070720.0, + "grad_norm": 2.55428286584305, + "language_loss": 0.86750233, + "learning_rate": 4.5772298818106625e-07, + "loss": 0.88940066, + "num_input_tokens_seen": 141036940, + "step": 6544, + "time_per_iteration": 2.7311513423919678 + }, + { + "auxiliary_loss_clip": 0.01087674, + "auxiliary_loss_mlp": 0.01084022, + "balance_loss_clip": 1.02216172, + "balance_loss_mlp": 1.00377035, + "epoch": 0.7869897192328503, + "flos": 29386276272000.0, + "grad_norm": 2.298487296052815, + "language_loss": 0.71842802, + "learning_rate": 4.572271593051384e-07, + "loss": 0.74014503, + "num_input_tokens_seen": 141054295, + "step": 6545, + "time_per_iteration": 2.798858642578125 + }, + { + "auxiliary_loss_clip": 0.01097917, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_clip": 1.02406693, + "balance_loss_mlp": 1.00495696, + "epoch": 0.7871099621234895, + "flos": 17128923678720.0, + "grad_norm": 1.6513907174384381, + "language_loss": 0.78479874, + "learning_rate": 4.567315644666245e-07, + "loss": 0.80662906, + "num_input_tokens_seen": 141073090, + "step": 6546, + "time_per_iteration": 2.8004817962646484 + }, + { + "auxiliary_loss_clip": 0.01108966, + "auxiliary_loss_mlp": 0.01085116, + "balance_loss_clip": 1.02508855, + "balance_loss_mlp": 1.00505471, + "epoch": 0.7872302050141285, + "flos": 23440187784960.0, + "grad_norm": 2.0981141898873337, + "language_loss": 0.84418523, + "learning_rate": 4.5623620374070507e-07, + "loss": 0.86612606, + "num_input_tokens_seen": 141092405, + "step": 6547, + "time_per_iteration": 3.8030245304107666 + }, + { + "auxiliary_loss_clip": 0.01094547, + "auxiliary_loss_mlp": 0.01079123, + "balance_loss_clip": 1.02382994, + "balance_loss_mlp": 1.00015914, + "epoch": 0.7873504479047676, + "flos": 65959752689280.0, + "grad_norm": 0.7594247739766887, + "language_loss": 0.58410251, + "learning_rate": 4.557410772025263e-07, + "loss": 0.60583925, + "num_input_tokens_seen": 141154355, + "step": 6548, + "time_per_iteration": 3.3867523670196533 + }, + { + "auxiliary_loss_clip": 0.01118567, + "auxiliary_loss_mlp": 0.01084414, + "balance_loss_clip": 1.02552414, + "balance_loss_mlp": 1.00430512, + "epoch": 0.7874706907954068, + "flos": 23258336204160.0, + "grad_norm": 2.525653230610366, + "language_loss": 0.66331905, + "learning_rate": 4.5524618492719803e-07, + "loss": 0.68534887, + "num_input_tokens_seen": 141173575, + "step": 6549, + "time_per_iteration": 3.6943657398223877 + }, + { + "auxiliary_loss_clip": 0.01126405, + "auxiliary_loss_mlp": 0.0108366, + "balance_loss_clip": 1.02565944, + "balance_loss_mlp": 1.00359917, + "epoch": 0.7875909336860458, + "flos": 28767786963840.0, + "grad_norm": 1.5446344119036364, + "language_loss": 0.79129452, + "learning_rate": 4.54751526989795e-07, + "loss": 0.81339514, + "num_input_tokens_seen": 141195415, + "step": 6550, + "time_per_iteration": 2.74019718170166 + }, + { + "auxiliary_loss_clip": 0.01127299, + "auxiliary_loss_mlp": 0.01084822, + "balance_loss_clip": 1.02591884, + "balance_loss_mlp": 1.00461829, + "epoch": 0.7877111765766849, + "flos": 18697286194560.0, + "grad_norm": 1.938470142648655, + "language_loss": 0.79151958, + "learning_rate": 4.5425710346535775e-07, + "loss": 0.81364083, + "num_input_tokens_seen": 141213360, + "step": 6551, + "time_per_iteration": 2.665224552154541 + }, + { + "auxiliary_loss_clip": 0.01126397, + "auxiliary_loss_mlp": 0.01083555, + "balance_loss_clip": 1.02544236, + "balance_loss_mlp": 1.00339866, + "epoch": 0.787831419467324, + "flos": 27592968833280.0, + "grad_norm": 1.837285568569316, + "language_loss": 0.81482911, + "learning_rate": 4.537629144288877e-07, + "loss": 0.83692861, + "num_input_tokens_seen": 141230815, + "step": 6552, + "time_per_iteration": 3.548224687576294 + }, + { + "auxiliary_loss_clip": 0.01084205, + "auxiliary_loss_mlp": 0.01084433, + "balance_loss_clip": 1.02431917, + "balance_loss_mlp": 1.00437176, + "epoch": 0.7879516623579631, + "flos": 18150187167360.0, + "grad_norm": 1.9386710031024832, + "language_loss": 0.74924612, + "learning_rate": 4.5326895995535477e-07, + "loss": 0.77093244, + "num_input_tokens_seen": 141249715, + "step": 6553, + "time_per_iteration": 2.8183681964874268 + }, + { + "auxiliary_loss_clip": 0.01124017, + "auxiliary_loss_mlp": 0.0108412, + "balance_loss_clip": 1.02408195, + "balance_loss_mlp": 1.00396335, + "epoch": 0.7880719052486022, + "flos": 20339193807360.0, + "grad_norm": 2.4436024430450107, + "language_loss": 0.83942306, + "learning_rate": 4.527752401196907e-07, + "loss": 0.86150444, + "num_input_tokens_seen": 141267730, + "step": 6554, + "time_per_iteration": 2.7238821983337402 + }, + { + "auxiliary_loss_clip": 0.01117516, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.02458668, + "balance_loss_mlp": 1.00335932, + "epoch": 0.7881921481392413, + "flos": 21653237053440.0, + "grad_norm": 1.7497107432991623, + "language_loss": 0.66803205, + "learning_rate": 4.5228175499679254e-07, + "loss": 0.69004285, + "num_input_tokens_seen": 141287315, + "step": 6555, + "time_per_iteration": 2.7414333820343018 + }, + { + "auxiliary_loss_clip": 0.0110707, + "auxiliary_loss_mlp": 0.01078818, + "balance_loss_clip": 1.01991653, + "balance_loss_mlp": 0.99985331, + "epoch": 0.7883123910298804, + "flos": 68565860058240.0, + "grad_norm": 0.8219953802857264, + "language_loss": 0.5448311, + "learning_rate": 4.5178850466152174e-07, + "loss": 0.56668997, + "num_input_tokens_seen": 141346145, + "step": 6556, + "time_per_iteration": 3.3325634002685547 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01083708, + "balance_loss_clip": 1.0264802, + "balance_loss_mlp": 1.00359917, + "epoch": 0.7884326339205194, + "flos": 19318217627520.0, + "grad_norm": 1.8152415162369289, + "language_loss": 0.8196559, + "learning_rate": 4.512954891887031e-07, + "loss": 0.84169203, + "num_input_tokens_seen": 141364445, + "step": 6557, + "time_per_iteration": 2.8246684074401855 + }, + { + "auxiliary_loss_clip": 0.01111551, + "auxiliary_loss_mlp": 0.0108401, + "balance_loss_clip": 1.02105832, + "balance_loss_mlp": 1.00380635, + "epoch": 0.7885528768111585, + "flos": 17784903807360.0, + "grad_norm": 2.0841564747636485, + "language_loss": 0.83319885, + "learning_rate": 4.5080270865312806e-07, + "loss": 0.85515451, + "num_input_tokens_seen": 141381640, + "step": 6558, + "time_per_iteration": 2.6867175102233887 + }, + { + "auxiliary_loss_clip": 0.01124551, + "auxiliary_loss_mlp": 0.01084187, + "balance_loss_clip": 1.02432799, + "balance_loss_mlp": 1.00398302, + "epoch": 0.7886731197017977, + "flos": 18807639753600.0, + "grad_norm": 2.996605992806401, + "language_loss": 0.71088159, + "learning_rate": 4.5031016312954985e-07, + "loss": 0.73296893, + "num_input_tokens_seen": 141399955, + "step": 6559, + "time_per_iteration": 2.7301361560821533 + }, + { + "auxiliary_loss_clip": 0.011279, + "auxiliary_loss_mlp": 0.01084316, + "balance_loss_clip": 1.02620864, + "balance_loss_mlp": 1.00411224, + "epoch": 0.7887933625924367, + "flos": 33365358126720.0, + "grad_norm": 1.9354274772434168, + "language_loss": 0.74253058, + "learning_rate": 4.498178526926886e-07, + "loss": 0.76465273, + "num_input_tokens_seen": 141420820, + "step": 6560, + "time_per_iteration": 2.7893412113189697 + }, + { + "auxiliary_loss_clip": 0.01134883, + "auxiliary_loss_mlp": 0.01084106, + "balance_loss_clip": 1.02605331, + "balance_loss_mlp": 1.00404453, + "epoch": 0.7889136054830758, + "flos": 17019360218880.0, + "grad_norm": 2.058263088657646, + "language_loss": 0.72419584, + "learning_rate": 4.4932577741722635e-07, + "loss": 0.74638569, + "num_input_tokens_seen": 141439350, + "step": 6561, + "time_per_iteration": 2.6745197772979736 + }, + { + "auxiliary_loss_clip": 0.01118704, + "auxiliary_loss_mlp": 0.01084612, + "balance_loss_clip": 1.02585089, + "balance_loss_mlp": 1.00450325, + "epoch": 0.7890338483737149, + "flos": 29424629018880.0, + "grad_norm": 2.4050607228243077, + "language_loss": 0.74193293, + "learning_rate": 4.4883393737780985e-07, + "loss": 0.76396608, + "num_input_tokens_seen": 141460300, + "step": 6562, + "time_per_iteration": 2.7957968711853027 + }, + { + "auxiliary_loss_clip": 0.01126788, + "auxiliary_loss_mlp": 0.01083517, + "balance_loss_clip": 1.02547562, + "balance_loss_mlp": 1.00345659, + "epoch": 0.789154091264354, + "flos": 19971576063360.0, + "grad_norm": 2.2325948672990465, + "language_loss": 0.78398597, + "learning_rate": 4.4834233264905254e-07, + "loss": 0.80608904, + "num_input_tokens_seen": 141477315, + "step": 6563, + "time_per_iteration": 2.655843734741211 + }, + { + "auxiliary_loss_clip": 0.01109418, + "auxiliary_loss_mlp": 0.0108443, + "balance_loss_clip": 1.0254941, + "balance_loss_mlp": 1.00422621, + "epoch": 0.789274334154993, + "flos": 14537825216640.0, + "grad_norm": 2.417469154570831, + "language_loss": 0.7213788, + "learning_rate": 4.478509633055294e-07, + "loss": 0.74331731, + "num_input_tokens_seen": 141495025, + "step": 6564, + "time_per_iteration": 2.787003993988037 + }, + { + "auxiliary_loss_clip": 0.01117233, + "auxiliary_loss_mlp": 0.01085871, + "balance_loss_clip": 1.02521789, + "balance_loss_mlp": 1.00557196, + "epoch": 0.7893945770456322, + "flos": 21827403123840.0, + "grad_norm": 2.4594287549147964, + "language_loss": 0.80546379, + "learning_rate": 4.473598294217813e-07, + "loss": 0.82749474, + "num_input_tokens_seen": 141510450, + "step": 6565, + "time_per_iteration": 2.688408136367798 + }, + { + "auxiliary_loss_clip": 0.01124165, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_clip": 1.02468061, + "balance_loss_mlp": 1.00360441, + "epoch": 0.7895148199362713, + "flos": 20740639184640.0, + "grad_norm": 2.326574323777497, + "language_loss": 0.71446788, + "learning_rate": 4.468689310723124e-07, + "loss": 0.73654616, + "num_input_tokens_seen": 141528265, + "step": 6566, + "time_per_iteration": 2.6888070106506348 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01083903, + "balance_loss_clip": 1.02428186, + "balance_loss_mlp": 1.0038898, + "epoch": 0.7896350628269103, + "flos": 16690669839360.0, + "grad_norm": 1.6463532713983464, + "language_loss": 0.78500795, + "learning_rate": 4.463782683315913e-07, + "loss": 0.80693734, + "num_input_tokens_seen": 141547270, + "step": 6567, + "time_per_iteration": 2.7560877799987793 + }, + { + "auxiliary_loss_clip": 0.01134545, + "auxiliary_loss_mlp": 0.01084078, + "balance_loss_clip": 1.02561724, + "balance_loss_mlp": 1.00392175, + "epoch": 0.7897553057175495, + "flos": 22638374438400.0, + "grad_norm": 1.633792416928785, + "language_loss": 0.73333251, + "learning_rate": 4.458878412740523e-07, + "loss": 0.75551867, + "num_input_tokens_seen": 141566050, + "step": 6568, + "time_per_iteration": 2.7174618244171143 + }, + { + "auxiliary_loss_clip": 0.01120331, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_clip": 1.02571273, + "balance_loss_mlp": 1.00518358, + "epoch": 0.7898755486081885, + "flos": 14537573821440.0, + "grad_norm": 2.366741673985033, + "language_loss": 0.77970201, + "learning_rate": 4.453976499740919e-07, + "loss": 0.80175769, + "num_input_tokens_seen": 141583695, + "step": 6569, + "time_per_iteration": 2.625582695007324 + }, + { + "auxiliary_loss_clip": 0.01119936, + "auxiliary_loss_mlp": 0.01085147, + "balance_loss_clip": 1.02552509, + "balance_loss_mlp": 1.00494313, + "epoch": 0.7899957914988276, + "flos": 17238487138560.0, + "grad_norm": 1.6918477264316063, + "language_loss": 0.77808511, + "learning_rate": 4.4490769450607215e-07, + "loss": 0.80013597, + "num_input_tokens_seen": 141601320, + "step": 6570, + "time_per_iteration": 2.6732025146484375 + }, + { + "auxiliary_loss_clip": 0.01111358, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_clip": 1.02610195, + "balance_loss_mlp": 1.00481105, + "epoch": 0.7901160343894668, + "flos": 41279351086080.0, + "grad_norm": 1.9634500465234035, + "language_loss": 0.72466791, + "learning_rate": 4.4441797494431845e-07, + "loss": 0.74663115, + "num_input_tokens_seen": 141623125, + "step": 6571, + "time_per_iteration": 2.940093994140625 + }, + { + "auxiliary_loss_clip": 0.01124836, + "auxiliary_loss_mlp": 0.01084499, + "balance_loss_clip": 1.02522659, + "balance_loss_mlp": 1.00439024, + "epoch": 0.7902362772801058, + "flos": 16837005847680.0, + "grad_norm": 2.015203060557589, + "language_loss": 0.77892041, + "learning_rate": 4.439284913631207e-07, + "loss": 0.80101371, + "num_input_tokens_seen": 141640335, + "step": 6572, + "time_per_iteration": 2.758192300796509 + }, + { + "auxiliary_loss_clip": 0.01091153, + "auxiliary_loss_mlp": 0.01084288, + "balance_loss_clip": 1.02524233, + "balance_loss_mlp": 1.00408363, + "epoch": 0.7903565201707449, + "flos": 27125987091840.0, + "grad_norm": 2.2471356429834266, + "language_loss": 0.83670127, + "learning_rate": 4.434392438367347e-07, + "loss": 0.85845566, + "num_input_tokens_seen": 141659760, + "step": 6573, + "time_per_iteration": 3.665999174118042 + }, + { + "auxiliary_loss_clip": 0.01110745, + "auxiliary_loss_mlp": 0.01084086, + "balance_loss_clip": 1.02574778, + "balance_loss_mlp": 1.00392926, + "epoch": 0.790476763061384, + "flos": 31025167142400.0, + "grad_norm": 2.0349937367000095, + "language_loss": 0.73784208, + "learning_rate": 4.4295023243937677e-07, + "loss": 0.75979042, + "num_input_tokens_seen": 141679965, + "step": 6574, + "time_per_iteration": 4.554906368255615 + }, + { + "auxiliary_loss_clip": 0.01120841, + "auxiliary_loss_mlp": 0.0108374, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.00348806, + "epoch": 0.7905970059520231, + "flos": 22089084681600.0, + "grad_norm": 1.8949814893167556, + "language_loss": 0.80094534, + "learning_rate": 4.4246145724523123e-07, + "loss": 0.82299113, + "num_input_tokens_seen": 141697710, + "step": 6575, + "time_per_iteration": 2.7344532012939453 + }, + { + "auxiliary_loss_clip": 0.01099119, + "auxiliary_loss_mlp": 0.01083953, + "balance_loss_clip": 1.02264631, + "balance_loss_mlp": 1.00384402, + "epoch": 0.7907172488426621, + "flos": 20558141159040.0, + "grad_norm": 1.9375685530099664, + "language_loss": 0.77387214, + "learning_rate": 4.41972918328444e-07, + "loss": 0.79570282, + "num_input_tokens_seen": 141715145, + "step": 6576, + "time_per_iteration": 2.8120594024658203 + }, + { + "auxiliary_loss_clip": 0.01125252, + "auxiliary_loss_mlp": 0.010836, + "balance_loss_clip": 1.02584279, + "balance_loss_mlp": 1.00349164, + "epoch": 0.7908374917333013, + "flos": 30081542901120.0, + "grad_norm": 2.212043842542395, + "language_loss": 0.7735194, + "learning_rate": 4.4148461576312646e-07, + "loss": 0.79560792, + "num_input_tokens_seen": 141734810, + "step": 6577, + "time_per_iteration": 2.738419532775879 + }, + { + "auxiliary_loss_clip": 0.01128147, + "auxiliary_loss_mlp": 0.01085027, + "balance_loss_clip": 1.02721798, + "balance_loss_mlp": 1.0049665, + "epoch": 0.7909577346239404, + "flos": 20996359084800.0, + "grad_norm": 1.5381776678752639, + "language_loss": 0.74533868, + "learning_rate": 4.4099654962335343e-07, + "loss": 0.76747042, + "num_input_tokens_seen": 141755260, + "step": 6578, + "time_per_iteration": 3.6033120155334473 + }, + { + "auxiliary_loss_clip": 0.01118091, + "auxiliary_loss_mlp": 0.0108452, + "balance_loss_clip": 1.02518559, + "balance_loss_mlp": 1.00431657, + "epoch": 0.7910779775145794, + "flos": 26247935128320.0, + "grad_norm": 1.8892143244724078, + "language_loss": 0.75307578, + "learning_rate": 4.405087199831636e-07, + "loss": 0.7751019, + "num_input_tokens_seen": 141775500, + "step": 6579, + "time_per_iteration": 2.8094136714935303 + }, + { + "auxiliary_loss_clip": 0.01116864, + "auxiliary_loss_mlp": 0.00872799, + "balance_loss_clip": 1.02434385, + "balance_loss_mlp": 1.00005054, + "epoch": 0.7911982204052186, + "flos": 22564434291840.0, + "grad_norm": 1.866308291324304, + "language_loss": 0.67430466, + "learning_rate": 4.400211269165619e-07, + "loss": 0.69420135, + "num_input_tokens_seen": 141791955, + "step": 6580, + "time_per_iteration": 2.7186837196350098 + }, + { + "auxiliary_loss_clip": 0.01138005, + "auxiliary_loss_mlp": 0.01083827, + "balance_loss_clip": 1.02823079, + "balance_loss_mlp": 1.00376654, + "epoch": 0.7913184632958576, + "flos": 23112538899840.0, + "grad_norm": 1.4016005707720873, + "language_loss": 0.7670638, + "learning_rate": 4.3953377049751416e-07, + "loss": 0.78928214, + "num_input_tokens_seen": 141812380, + "step": 6581, + "time_per_iteration": 2.6780896186828613 + }, + { + "auxiliary_loss_clip": 0.01117456, + "auxiliary_loss_mlp": 0.01084249, + "balance_loss_clip": 1.025491, + "balance_loss_mlp": 1.0041405, + "epoch": 0.7914387061864967, + "flos": 12311758719360.0, + "grad_norm": 5.135755761103821, + "language_loss": 0.77661562, + "learning_rate": 4.390466507999537e-07, + "loss": 0.79863268, + "num_input_tokens_seen": 141828130, + "step": 6582, + "time_per_iteration": 2.699420690536499 + }, + { + "auxiliary_loss_clip": 0.01099536, + "auxiliary_loss_mlp": 0.01084635, + "balance_loss_clip": 1.0230217, + "balance_loss_mlp": 1.00457442, + "epoch": 0.7915589490771359, + "flos": 17603267708160.0, + "grad_norm": 2.0343127469176627, + "language_loss": 0.75918496, + "learning_rate": 4.385597678977748e-07, + "loss": 0.7810266, + "num_input_tokens_seen": 141846965, + "step": 6583, + "time_per_iteration": 2.810148239135742 + }, + { + "auxiliary_loss_clip": 0.01112118, + "auxiliary_loss_mlp": 0.01084377, + "balance_loss_clip": 1.02127671, + "balance_loss_mlp": 1.0041728, + "epoch": 0.7916791919677749, + "flos": 25591272641280.0, + "grad_norm": 1.6264276119770624, + "language_loss": 0.75596619, + "learning_rate": 4.3807312186483726e-07, + "loss": 0.77793121, + "num_input_tokens_seen": 141867685, + "step": 6584, + "time_per_iteration": 2.7448389530181885 + }, + { + "auxiliary_loss_clip": 0.01125209, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_clip": 1.02612996, + "balance_loss_mlp": 1.00395358, + "epoch": 0.791799434858414, + "flos": 18844340474880.0, + "grad_norm": 1.9179759848283287, + "language_loss": 0.78760374, + "learning_rate": 4.375867127749655e-07, + "loss": 0.80969685, + "num_input_tokens_seen": 141885960, + "step": 6585, + "time_per_iteration": 2.7146589756011963 + }, + { + "auxiliary_loss_clip": 0.0110776, + "auxiliary_loss_mlp": 0.01085215, + "balance_loss_clip": 1.02489972, + "balance_loss_mlp": 1.00501108, + "epoch": 0.7919196777490531, + "flos": 25812015672960.0, + "grad_norm": 1.7409284288735767, + "language_loss": 0.66990077, + "learning_rate": 4.3710054070194744e-07, + "loss": 0.69183052, + "num_input_tokens_seen": 141905655, + "step": 6586, + "time_per_iteration": 2.769035577774048 + }, + { + "auxiliary_loss_clip": 0.01134903, + "auxiliary_loss_mlp": 0.00872922, + "balance_loss_clip": 1.02517068, + "balance_loss_mlp": 1.00006378, + "epoch": 0.7920399206396922, + "flos": 11947624594560.0, + "grad_norm": 2.8007052140861948, + "language_loss": 0.66517252, + "learning_rate": 4.3661460571953455e-07, + "loss": 0.68525076, + "num_input_tokens_seen": 141922390, + "step": 6587, + "time_per_iteration": 2.6813995838165283 + }, + { + "auxiliary_loss_clip": 0.01125877, + "auxiliary_loss_mlp": 0.01083023, + "balance_loss_clip": 1.02462018, + "balance_loss_mlp": 1.00296152, + "epoch": 0.7921601635303313, + "flos": 21579907438080.0, + "grad_norm": 1.7311337387067112, + "language_loss": 0.68602693, + "learning_rate": 4.36128907901443e-07, + "loss": 0.70811582, + "num_input_tokens_seen": 141941985, + "step": 6588, + "time_per_iteration": 2.688605785369873 + }, + { + "auxiliary_loss_clip": 0.01109394, + "auxiliary_loss_mlp": 0.01083177, + "balance_loss_clip": 1.02537894, + "balance_loss_mlp": 1.00306785, + "epoch": 0.7922804064209703, + "flos": 18113989236480.0, + "grad_norm": 2.3822946749518925, + "language_loss": 0.72353566, + "learning_rate": 4.356434473213519e-07, + "loss": 0.74546134, + "num_input_tokens_seen": 141959435, + "step": 6589, + "time_per_iteration": 2.787631034851074 + }, + { + "auxiliary_loss_clip": 0.01115831, + "auxiliary_loss_mlp": 0.01083869, + "balance_loss_clip": 1.02519608, + "balance_loss_mlp": 1.00376046, + "epoch": 0.7924006493116095, + "flos": 21652806090240.0, + "grad_norm": 1.552474092674391, + "language_loss": 0.79351157, + "learning_rate": 4.351582240529068e-07, + "loss": 0.81550854, + "num_input_tokens_seen": 141980265, + "step": 6590, + "time_per_iteration": 2.7773900032043457 + }, + { + "auxiliary_loss_clip": 0.01099307, + "auxiliary_loss_mlp": 0.01078948, + "balance_loss_clip": 1.01980531, + "balance_loss_mlp": 0.99998343, + "epoch": 0.7925208922022485, + "flos": 64242755694720.0, + "grad_norm": 0.6799691597869412, + "language_loss": 0.58270311, + "learning_rate": 4.346732381697149e-07, + "loss": 0.60448563, + "num_input_tokens_seen": 142044395, + "step": 6591, + "time_per_iteration": 3.332430839538574 + }, + { + "auxiliary_loss_clip": 0.01114598, + "auxiliary_loss_mlp": 0.01084048, + "balance_loss_clip": 1.02406454, + "balance_loss_mlp": 1.00393963, + "epoch": 0.7926411350928876, + "flos": 16941541403520.0, + "grad_norm": 1.8608503132077325, + "language_loss": 0.81202269, + "learning_rate": 4.3418848974534825e-07, + "loss": 0.83400911, + "num_input_tokens_seen": 142061335, + "step": 6592, + "time_per_iteration": 2.759636163711548 + }, + { + "auxiliary_loss_clip": 0.01110397, + "auxiliary_loss_mlp": 0.01084955, + "balance_loss_clip": 1.02629924, + "balance_loss_mlp": 1.00489438, + "epoch": 0.7927613779835267, + "flos": 34459987144320.0, + "grad_norm": 1.5630107961173312, + "language_loss": 0.68558478, + "learning_rate": 4.3370397885334276e-07, + "loss": 0.70753831, + "num_input_tokens_seen": 142081965, + "step": 6593, + "time_per_iteration": 2.816917657852173 + }, + { + "auxiliary_loss_clip": 0.01127681, + "auxiliary_loss_mlp": 0.01084303, + "balance_loss_clip": 1.02614939, + "balance_loss_mlp": 1.00409865, + "epoch": 0.7928816208741658, + "flos": 18951174501120.0, + "grad_norm": 1.6500723193918676, + "language_loss": 0.75430691, + "learning_rate": 4.3321970556719777e-07, + "loss": 0.77642673, + "num_input_tokens_seen": 142100260, + "step": 6594, + "time_per_iteration": 2.688645601272583 + }, + { + "auxiliary_loss_clip": 0.01136375, + "auxiliary_loss_mlp": 0.01083932, + "balance_loss_clip": 1.02703774, + "balance_loss_mlp": 1.00372767, + "epoch": 0.7930018637648049, + "flos": 18623022825600.0, + "grad_norm": 2.4448159583192997, + "language_loss": 0.71925318, + "learning_rate": 4.3273566996037856e-07, + "loss": 0.74145627, + "num_input_tokens_seen": 142116955, + "step": 6595, + "time_per_iteration": 2.615354299545288 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01084385, + "balance_loss_clip": 1.02494705, + "balance_loss_mlp": 1.00427639, + "epoch": 0.793122106655444, + "flos": 24530650824960.0, + "grad_norm": 2.258484629637618, + "language_loss": 0.80536044, + "learning_rate": 4.322518721063113e-07, + "loss": 0.82737058, + "num_input_tokens_seen": 142135505, + "step": 6596, + "time_per_iteration": 2.733837604522705 + }, + { + "auxiliary_loss_clip": 0.01126595, + "auxiliary_loss_mlp": 0.01084474, + "balance_loss_clip": 1.02652717, + "balance_loss_mlp": 1.00427055, + "epoch": 0.7932423495460831, + "flos": 34421203434240.0, + "grad_norm": 1.8366012800670577, + "language_loss": 0.70345962, + "learning_rate": 4.3176831207838906e-07, + "loss": 0.72557032, + "num_input_tokens_seen": 142158915, + "step": 6597, + "time_per_iteration": 2.817760467529297 + }, + { + "auxiliary_loss_clip": 0.01127265, + "auxiliary_loss_mlp": 0.01083769, + "balance_loss_clip": 1.02794123, + "balance_loss_mlp": 1.00361252, + "epoch": 0.7933625924367221, + "flos": 26980333441920.0, + "grad_norm": 1.9705468257338183, + "language_loss": 0.74569607, + "learning_rate": 4.3128498994996685e-07, + "loss": 0.76780641, + "num_input_tokens_seen": 142178390, + "step": 6598, + "time_per_iteration": 3.5491387844085693 + }, + { + "auxiliary_loss_clip": 0.01126492, + "auxiliary_loss_mlp": 0.01084504, + "balance_loss_clip": 1.02512467, + "balance_loss_mlp": 1.00439513, + "epoch": 0.7934828353273613, + "flos": 29568630643200.0, + "grad_norm": 2.2381232703225984, + "language_loss": 0.71064878, + "learning_rate": 4.308019057943646e-07, + "loss": 0.73275864, + "num_input_tokens_seen": 142200115, + "step": 6599, + "time_per_iteration": 2.795487642288208 + }, + { + "auxiliary_loss_clip": 0.01083039, + "auxiliary_loss_mlp": 0.01083344, + "balance_loss_clip": 1.02433038, + "balance_loss_mlp": 1.00318766, + "epoch": 0.7936030782180004, + "flos": 28615381557120.0, + "grad_norm": 1.8238699560107507, + "language_loss": 0.74479973, + "learning_rate": 4.3031905968486535e-07, + "loss": 0.76646352, + "num_input_tokens_seen": 142220945, + "step": 6600, + "time_per_iteration": 4.731036901473999 + }, + { + "auxiliary_loss_clip": 0.01095101, + "auxiliary_loss_mlp": 0.01084248, + "balance_loss_clip": 1.02286077, + "balance_loss_mlp": 1.00413942, + "epoch": 0.7937233211086394, + "flos": 16392574869120.0, + "grad_norm": 1.8837704201705383, + "language_loss": 0.6878491, + "learning_rate": 4.298364516947162e-07, + "loss": 0.70964265, + "num_input_tokens_seen": 142238175, + "step": 6601, + "time_per_iteration": 2.8000447750091553 + }, + { + "auxiliary_loss_clip": 0.01098832, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.02399909, + "balance_loss_mlp": 1.00464714, + "epoch": 0.7938435639992786, + "flos": 22013420682240.0, + "grad_norm": 2.214607097540126, + "language_loss": 0.65829849, + "learning_rate": 4.293540818971295e-07, + "loss": 0.6801343, + "num_input_tokens_seen": 142255980, + "step": 6602, + "time_per_iteration": 2.8144187927246094 + }, + { + "auxiliary_loss_clip": 0.01126385, + "auxiliary_loss_mlp": 0.01084229, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.00407267, + "epoch": 0.7939638068899176, + "flos": 22197032029440.0, + "grad_norm": 2.3427559779684852, + "language_loss": 0.76664376, + "learning_rate": 4.2887195036527934e-07, + "loss": 0.78874993, + "num_input_tokens_seen": 142274785, + "step": 6603, + "time_per_iteration": 2.658297300338745 + }, + { + "auxiliary_loss_clip": 0.01126391, + "auxiliary_loss_mlp": 0.01083679, + "balance_loss_clip": 1.02510238, + "balance_loss_mlp": 1.00347531, + "epoch": 0.7940840497805567, + "flos": 17745186343680.0, + "grad_norm": 2.5818981782808064, + "language_loss": 0.73203427, + "learning_rate": 4.28390057172306e-07, + "loss": 0.75413495, + "num_input_tokens_seen": 142291290, + "step": 6604, + "time_per_iteration": 3.704317569732666 + }, + { + "auxiliary_loss_clip": 0.01110434, + "auxiliary_loss_mlp": 0.01084853, + "balance_loss_clip": 1.02466154, + "balance_loss_mlp": 1.00460124, + "epoch": 0.7942042926711959, + "flos": 23805435231360.0, + "grad_norm": 1.9853097822715042, + "language_loss": 0.7173329, + "learning_rate": 4.279084023913111e-07, + "loss": 0.73928583, + "num_input_tokens_seen": 142309165, + "step": 6605, + "time_per_iteration": 2.8569226264953613 + }, + { + "auxiliary_loss_clip": 0.01126279, + "auxiliary_loss_mlp": 0.01084681, + "balance_loss_clip": 1.02646434, + "balance_loss_mlp": 1.00452435, + "epoch": 0.7943245355618349, + "flos": 19244959839360.0, + "grad_norm": 1.9614695643218771, + "language_loss": 0.69707966, + "learning_rate": 4.2742698609536096e-07, + "loss": 0.71918929, + "num_input_tokens_seen": 142327475, + "step": 6606, + "time_per_iteration": 2.6754226684570312 + }, + { + "auxiliary_loss_clip": 0.01116996, + "auxiliary_loss_mlp": 0.01083609, + "balance_loss_clip": 1.02504468, + "balance_loss_mlp": 1.00350046, + "epoch": 0.794444778452474, + "flos": 25007616547200.0, + "grad_norm": 1.6673499311294646, + "language_loss": 0.78826255, + "learning_rate": 4.2694580835748706e-07, + "loss": 0.81026858, + "num_input_tokens_seen": 142347335, + "step": 6607, + "time_per_iteration": 2.7687814235687256 + }, + { + "auxiliary_loss_clip": 0.01101902, + "auxiliary_loss_mlp": 0.01083867, + "balance_loss_clip": 1.02514982, + "balance_loss_mlp": 1.00380564, + "epoch": 0.7945650213431131, + "flos": 23221491828480.0, + "grad_norm": 2.107347326651038, + "language_loss": 0.73698854, + "learning_rate": 4.264648692506836e-07, + "loss": 0.75884628, + "num_input_tokens_seen": 142366125, + "step": 6608, + "time_per_iteration": 2.7098326683044434 + }, + { + "auxiliary_loss_clip": 0.01117018, + "auxiliary_loss_mlp": 0.01084276, + "balance_loss_clip": 1.02556252, + "balance_loss_mlp": 1.00397635, + "epoch": 0.7946852642337522, + "flos": 26062887237120.0, + "grad_norm": 1.9286676448701852, + "language_loss": 0.72228932, + "learning_rate": 4.2598416884790824e-07, + "loss": 0.74430227, + "num_input_tokens_seen": 142385175, + "step": 6609, + "time_per_iteration": 2.7799224853515625 + }, + { + "auxiliary_loss_clip": 0.01118428, + "auxiliary_loss_mlp": 0.01084168, + "balance_loss_clip": 1.02498865, + "balance_loss_mlp": 1.00405943, + "epoch": 0.7948055071243912, + "flos": 23769704177280.0, + "grad_norm": 1.8494466574730857, + "language_loss": 0.80743718, + "learning_rate": 4.255037072220828e-07, + "loss": 0.82946312, + "num_input_tokens_seen": 142406545, + "step": 6610, + "time_per_iteration": 2.7272205352783203 + }, + { + "auxiliary_loss_clip": 0.01134719, + "auxiliary_loss_mlp": 0.01083128, + "balance_loss_clip": 1.02567577, + "balance_loss_mlp": 1.00301909, + "epoch": 0.7949257500150304, + "flos": 21980814111360.0, + "grad_norm": 1.4988913371004595, + "language_loss": 0.71715629, + "learning_rate": 4.2502348444609293e-07, + "loss": 0.73933482, + "num_input_tokens_seen": 142426165, + "step": 6611, + "time_per_iteration": 2.6951019763946533 + }, + { + "auxiliary_loss_clip": 0.01099473, + "auxiliary_loss_mlp": 0.01084163, + "balance_loss_clip": 1.02328062, + "balance_loss_mlp": 1.00400651, + "epoch": 0.7950459929056695, + "flos": 25774129802880.0, + "grad_norm": 1.9399221524224994, + "language_loss": 0.69280612, + "learning_rate": 4.2454350059278844e-07, + "loss": 0.71464252, + "num_input_tokens_seen": 142447225, + "step": 6612, + "time_per_iteration": 2.8855152130126953 + }, + { + "auxiliary_loss_clip": 0.01116656, + "auxiliary_loss_mlp": 0.01083499, + "balance_loss_clip": 1.02340329, + "balance_loss_mlp": 1.00329483, + "epoch": 0.7951662357963085, + "flos": 22158068751360.0, + "grad_norm": 1.7271208263364932, + "language_loss": 0.84463477, + "learning_rate": 4.240637557349824e-07, + "loss": 0.86663628, + "num_input_tokens_seen": 142464440, + "step": 6613, + "time_per_iteration": 2.7816452980041504 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.01083652, + "balance_loss_clip": 1.02479196, + "balance_loss_mlp": 1.00354385, + "epoch": 0.7952864786869477, + "flos": 24641938137600.0, + "grad_norm": 2.433614849195554, + "language_loss": 0.66337299, + "learning_rate": 4.235842499454516e-07, + "loss": 0.68531281, + "num_input_tokens_seen": 142484355, + "step": 6614, + "time_per_iteration": 2.762709856033325 + }, + { + "auxiliary_loss_clip": 0.01115488, + "auxiliary_loss_mlp": 0.01085514, + "balance_loss_clip": 1.02385139, + "balance_loss_mlp": 1.00540578, + "epoch": 0.7954067215775867, + "flos": 21830922656640.0, + "grad_norm": 1.6453342260053285, + "language_loss": 0.82510394, + "learning_rate": 4.2310498329693687e-07, + "loss": 0.84711397, + "num_input_tokens_seen": 142505255, + "step": 6615, + "time_per_iteration": 2.75249981880188 + }, + { + "auxiliary_loss_clip": 0.01125789, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.0251174, + "balance_loss_mlp": 1.00522065, + "epoch": 0.7955269644682258, + "flos": 24060652341120.0, + "grad_norm": 6.602876466415944, + "language_loss": 0.80705953, + "learning_rate": 4.2262595586214164e-07, + "loss": 0.82917213, + "num_input_tokens_seen": 142526350, + "step": 6616, + "time_per_iteration": 2.7183902263641357 + }, + { + "auxiliary_loss_clip": 0.01127187, + "auxiliary_loss_mlp": 0.0108357, + "balance_loss_clip": 1.02601862, + "balance_loss_mlp": 1.00341368, + "epoch": 0.795647207358865, + "flos": 25010741030400.0, + "grad_norm": 1.54343859696142, + "language_loss": 0.7667973, + "learning_rate": 4.221471677137358e-07, + "loss": 0.78890485, + "num_input_tokens_seen": 142547165, + "step": 6617, + "time_per_iteration": 2.741649866104126 + }, + { + "auxiliary_loss_clip": 0.01116774, + "auxiliary_loss_mlp": 0.01083782, + "balance_loss_clip": 1.02523124, + "balance_loss_mlp": 1.00376856, + "epoch": 0.795767450249504, + "flos": 14648358343680.0, + "grad_norm": 1.5199531094576115, + "language_loss": 0.70257849, + "learning_rate": 4.216686189243492e-07, + "loss": 0.72458404, + "num_input_tokens_seen": 142565955, + "step": 6618, + "time_per_iteration": 2.7826201915740967 + }, + { + "auxiliary_loss_clip": 0.01105177, + "auxiliary_loss_mlp": 0.0108434, + "balance_loss_clip": 1.02319491, + "balance_loss_mlp": 1.00418377, + "epoch": 0.7958876931401431, + "flos": 18547897530240.0, + "grad_norm": 1.9512584897086713, + "language_loss": 0.72818822, + "learning_rate": 4.211903095665785e-07, + "loss": 0.75008345, + "num_input_tokens_seen": 142585340, + "step": 6619, + "time_per_iteration": 2.745051383972168 + }, + { + "auxiliary_loss_clip": 0.01123717, + "auxiliary_loss_mlp": 0.01083957, + "balance_loss_clip": 1.02427578, + "balance_loss_mlp": 1.00384843, + "epoch": 0.7960079360307821, + "flos": 21543960902400.0, + "grad_norm": 1.8940042066716862, + "language_loss": 0.75347483, + "learning_rate": 4.2071223971298277e-07, + "loss": 0.77555156, + "num_input_tokens_seen": 142602525, + "step": 6620, + "time_per_iteration": 2.6896469593048096 + }, + { + "auxiliary_loss_clip": 0.01128303, + "auxiliary_loss_mlp": 0.0108422, + "balance_loss_clip": 1.02678072, + "balance_loss_mlp": 1.00411129, + "epoch": 0.7961281789214213, + "flos": 25481745095040.0, + "grad_norm": 2.311283825463071, + "language_loss": 0.60887021, + "learning_rate": 4.2023440943608433e-07, + "loss": 0.63099539, + "num_input_tokens_seen": 142622490, + "step": 6621, + "time_per_iteration": 2.791876792907715 + }, + { + "auxiliary_loss_clip": 0.01125698, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_clip": 1.02478123, + "balance_loss_mlp": 1.00292921, + "epoch": 0.7962484218120603, + "flos": 21944436612480.0, + "grad_norm": 1.583244008272486, + "language_loss": 0.77931404, + "learning_rate": 4.1975681880837023e-07, + "loss": 0.80140138, + "num_input_tokens_seen": 142642495, + "step": 6622, + "time_per_iteration": 2.663661003112793 + }, + { + "auxiliary_loss_clip": 0.01108167, + "auxiliary_loss_mlp": 0.01084309, + "balance_loss_clip": 1.02406693, + "balance_loss_mlp": 1.00410533, + "epoch": 0.7963686647026994, + "flos": 18876264687360.0, + "grad_norm": 1.6090187002120004, + "language_loss": 0.82448745, + "learning_rate": 4.192794679022895e-07, + "loss": 0.84641218, + "num_input_tokens_seen": 142660820, + "step": 6623, + "time_per_iteration": 3.6139159202575684 + }, + { + "auxiliary_loss_clip": 0.01126112, + "auxiliary_loss_mlp": 0.01083359, + "balance_loss_clip": 1.02494264, + "balance_loss_mlp": 1.00329781, + "epoch": 0.7964889075933386, + "flos": 29716582763520.0, + "grad_norm": 1.7980962716478561, + "language_loss": 0.72306836, + "learning_rate": 4.1880235679025743e-07, + "loss": 0.74516308, + "num_input_tokens_seen": 142680915, + "step": 6624, + "time_per_iteration": 2.7064383029937744 + }, + { + "auxiliary_loss_clip": 0.01089897, + "auxiliary_loss_mlp": 0.01083426, + "balance_loss_clip": 1.02281225, + "balance_loss_mlp": 1.00331783, + "epoch": 0.7966091504839776, + "flos": 29491458272640.0, + "grad_norm": 1.7949781079411389, + "language_loss": 0.63761806, + "learning_rate": 4.1832548554464986e-07, + "loss": 0.65935135, + "num_input_tokens_seen": 142699210, + "step": 6625, + "time_per_iteration": 3.9018261432647705 + }, + { + "auxiliary_loss_clip": 0.01109471, + "auxiliary_loss_mlp": 0.01078838, + "balance_loss_clip": 1.02290165, + "balance_loss_mlp": 0.99987429, + "epoch": 0.7967293933746167, + "flos": 67288697101440.0, + "grad_norm": 0.7413050835862912, + "language_loss": 0.58717024, + "learning_rate": 4.178488542378098e-07, + "loss": 0.60905337, + "num_input_tokens_seen": 142756790, + "step": 6626, + "time_per_iteration": 4.022934675216675 + }, + { + "auxiliary_loss_clip": 0.01135726, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_clip": 1.02588713, + "balance_loss_mlp": 1.00398302, + "epoch": 0.7968496362652558, + "flos": 25554679660800.0, + "grad_norm": 1.6886818314051137, + "language_loss": 0.88801295, + "learning_rate": 4.173724629420401e-07, + "loss": 0.91021156, + "num_input_tokens_seen": 142778150, + "step": 6627, + "time_per_iteration": 2.651839017868042 + }, + { + "auxiliary_loss_clip": 0.01115937, + "auxiliary_loss_mlp": 0.0108392, + "balance_loss_clip": 1.02394724, + "balance_loss_mlp": 1.00371587, + "epoch": 0.7969698791558949, + "flos": 14501088581760.0, + "grad_norm": 2.504656417838514, + "language_loss": 0.68577588, + "learning_rate": 4.168963117296087e-07, + "loss": 0.7077744, + "num_input_tokens_seen": 142795485, + "step": 6628, + "time_per_iteration": 2.721799612045288 + }, + { + "auxiliary_loss_clip": 0.01135722, + "auxiliary_loss_mlp": 0.01083378, + "balance_loss_clip": 1.02661705, + "balance_loss_mlp": 1.00326967, + "epoch": 0.797090122046534, + "flos": 22127545169280.0, + "grad_norm": 2.1816345835178432, + "language_loss": 0.75729007, + "learning_rate": 4.1642040067274876e-07, + "loss": 0.77948117, + "num_input_tokens_seen": 142815155, + "step": 6629, + "time_per_iteration": 2.571690559387207 + }, + { + "auxiliary_loss_clip": 0.0111755, + "auxiliary_loss_mlp": 0.01084232, + "balance_loss_clip": 1.02447188, + "balance_loss_mlp": 1.00412321, + "epoch": 0.7972103649371731, + "flos": 19897671830400.0, + "grad_norm": 1.6144996662085076, + "language_loss": 0.72506154, + "learning_rate": 4.1594472984365493e-07, + "loss": 0.74707937, + "num_input_tokens_seen": 142833840, + "step": 6630, + "time_per_iteration": 3.76666259765625 + }, + { + "auxiliary_loss_clip": 0.01119997, + "auxiliary_loss_mlp": 0.01084603, + "balance_loss_clip": 1.02146733, + "balance_loss_mlp": 1.00449467, + "epoch": 0.7973306078278122, + "flos": 36058621847040.0, + "grad_norm": 1.8985459444817114, + "language_loss": 0.77587807, + "learning_rate": 4.154692993144862e-07, + "loss": 0.79792404, + "num_input_tokens_seen": 142853610, + "step": 6631, + "time_per_iteration": 2.817918539047241 + }, + { + "auxiliary_loss_clip": 0.01135376, + "auxiliary_loss_mlp": 0.00872864, + "balance_loss_clip": 1.02588844, + "balance_loss_mlp": 1.00008774, + "epoch": 0.7974508507184512, + "flos": 21360600950400.0, + "grad_norm": 1.9690995795399968, + "language_loss": 0.7107228, + "learning_rate": 4.1499410915736476e-07, + "loss": 0.73080516, + "num_input_tokens_seen": 142872540, + "step": 6632, + "time_per_iteration": 2.662830352783203 + }, + { + "auxiliary_loss_clip": 0.01106345, + "auxiliary_loss_mlp": 0.01078852, + "balance_loss_clip": 1.01920164, + "balance_loss_mlp": 0.99988765, + "epoch": 0.7975710936090904, + "flos": 68253115317120.0, + "grad_norm": 0.7630679335785642, + "language_loss": 0.64301795, + "learning_rate": 4.145191594443762e-07, + "loss": 0.6648699, + "num_input_tokens_seen": 142936895, + "step": 6633, + "time_per_iteration": 3.4062533378601074 + }, + { + "auxiliary_loss_clip": 0.01105836, + "auxiliary_loss_mlp": 0.01084823, + "balance_loss_clip": 1.02249992, + "balance_loss_mlp": 1.00461876, + "epoch": 0.7976913364997295, + "flos": 22492433479680.0, + "grad_norm": 1.8048372513150728, + "language_loss": 0.70326877, + "learning_rate": 4.140444502475713e-07, + "loss": 0.72517532, + "num_input_tokens_seen": 142956445, + "step": 6634, + "time_per_iteration": 2.7736542224884033 + }, + { + "auxiliary_loss_clip": 0.01126795, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_clip": 1.02483344, + "balance_loss_mlp": 1.00360429, + "epoch": 0.7978115793903685, + "flos": 15263220378240.0, + "grad_norm": 1.8571887979428696, + "language_loss": 0.69800079, + "learning_rate": 4.1356998163896216e-07, + "loss": 0.72010535, + "num_input_tokens_seen": 142973495, + "step": 6635, + "time_per_iteration": 2.635836362838745 + }, + { + "auxiliary_loss_clip": 0.0110743, + "auxiliary_loss_mlp": 0.01083605, + "balance_loss_clip": 1.02393579, + "balance_loss_mlp": 1.00349641, + "epoch": 0.7979318222810077, + "flos": 19719232041600.0, + "grad_norm": 1.975086788326542, + "language_loss": 0.74801052, + "learning_rate": 4.130957536905255e-07, + "loss": 0.76992083, + "num_input_tokens_seen": 142991510, + "step": 6636, + "time_per_iteration": 2.77874493598938 + }, + { + "auxiliary_loss_clip": 0.01101809, + "auxiliary_loss_mlp": 0.01084059, + "balance_loss_clip": 1.02572751, + "balance_loss_mlp": 1.00390303, + "epoch": 0.7980520651716467, + "flos": 15560273854080.0, + "grad_norm": 2.581571524684357, + "language_loss": 0.715617, + "learning_rate": 4.1262176647420134e-07, + "loss": 0.73747563, + "num_input_tokens_seen": 143009675, + "step": 6637, + "time_per_iteration": 2.7028167247772217 + }, + { + "auxiliary_loss_clip": 0.01117848, + "auxiliary_loss_mlp": 0.01083735, + "balance_loss_clip": 1.0247004, + "balance_loss_mlp": 1.00367427, + "epoch": 0.7981723080622858, + "flos": 22309432663680.0, + "grad_norm": 1.5679394181648094, + "language_loss": 0.79963404, + "learning_rate": 4.121480200618923e-07, + "loss": 0.82164991, + "num_input_tokens_seen": 143029330, + "step": 6638, + "time_per_iteration": 2.743645429611206 + }, + { + "auxiliary_loss_clip": 0.01116176, + "auxiliary_loss_mlp": 0.01083856, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.00384235, + "epoch": 0.798292550952925, + "flos": 22929573997440.0, + "grad_norm": 1.747597506859594, + "language_loss": 0.80032289, + "learning_rate": 4.116745145254674e-07, + "loss": 0.8223232, + "num_input_tokens_seen": 143048865, + "step": 6639, + "time_per_iteration": 2.828433036804199 + }, + { + "auxiliary_loss_clip": 0.01096037, + "auxiliary_loss_mlp": 0.0107898, + "balance_loss_clip": 1.01843691, + "balance_loss_mlp": 1.0000155, + "epoch": 0.798412793843564, + "flos": 64497936890880.0, + "grad_norm": 0.7659255700581366, + "language_loss": 0.58033717, + "learning_rate": 4.1120124993675476e-07, + "loss": 0.60208732, + "num_input_tokens_seen": 143113295, + "step": 6640, + "time_per_iteration": 3.3091392517089844 + }, + { + "auxiliary_loss_clip": 0.01119274, + "auxiliary_loss_mlp": 0.01083961, + "balance_loss_clip": 1.0256145, + "balance_loss_mlp": 1.00380468, + "epoch": 0.7985330367342031, + "flos": 13586910514560.0, + "grad_norm": 2.1836554902817706, + "language_loss": 0.61578113, + "learning_rate": 4.107282263675498e-07, + "loss": 0.63781345, + "num_input_tokens_seen": 143130965, + "step": 6641, + "time_per_iteration": 2.670012950897217 + }, + { + "auxiliary_loss_clip": 0.01093996, + "auxiliary_loss_mlp": 0.00872951, + "balance_loss_clip": 1.02392888, + "balance_loss_mlp": 1.00125527, + "epoch": 0.7986532796248422, + "flos": 67698797656320.0, + "grad_norm": 0.7731049707255447, + "language_loss": 0.52540636, + "learning_rate": 4.1025544388960907e-07, + "loss": 0.54507589, + "num_input_tokens_seen": 143192005, + "step": 6642, + "time_per_iteration": 3.301316976547241 + }, + { + "auxiliary_loss_clip": 0.01126629, + "auxiliary_loss_mlp": 0.01083405, + "balance_loss_clip": 1.02710056, + "balance_loss_mlp": 1.00324845, + "epoch": 0.7987735225154813, + "flos": 22455373622400.0, + "grad_norm": 1.660075990599898, + "language_loss": 0.7171073, + "learning_rate": 4.097829025746538e-07, + "loss": 0.73920763, + "num_input_tokens_seen": 143213550, + "step": 6643, + "time_per_iteration": 2.7234606742858887 + }, + { + "auxiliary_loss_clip": 0.01106191, + "auxiliary_loss_mlp": 0.01078952, + "balance_loss_clip": 1.01938581, + "balance_loss_mlp": 0.9999882, + "epoch": 0.7988937654061203, + "flos": 68864098682880.0, + "grad_norm": 0.6581022699901702, + "language_loss": 0.6103946, + "learning_rate": 4.0931060249436757e-07, + "loss": 0.63224602, + "num_input_tokens_seen": 143277390, + "step": 6644, + "time_per_iteration": 3.2857120037078857 + }, + { + "auxiliary_loss_clip": 0.01125062, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.02602899, + "balance_loss_mlp": 1.00393856, + "epoch": 0.7990140082967595, + "flos": 20806893820800.0, + "grad_norm": 2.966817474438717, + "language_loss": 0.69802403, + "learning_rate": 4.088385437203978e-07, + "loss": 0.72011507, + "num_input_tokens_seen": 143294400, + "step": 6645, + "time_per_iteration": 2.772967576980591 + }, + { + "auxiliary_loss_clip": 0.01134293, + "auxiliary_loss_mlp": 0.01083694, + "balance_loss_clip": 1.02490163, + "balance_loss_mlp": 1.00353813, + "epoch": 0.7991342511873986, + "flos": 18985289443200.0, + "grad_norm": 2.276684266742009, + "language_loss": 0.77338475, + "learning_rate": 4.083667263243564e-07, + "loss": 0.79556465, + "num_input_tokens_seen": 143312745, + "step": 6646, + "time_per_iteration": 2.671694040298462 + }, + { + "auxiliary_loss_clip": 0.01119999, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_clip": 1.02203441, + "balance_loss_mlp": 1.00359392, + "epoch": 0.7992544940780376, + "flos": 20816805974400.0, + "grad_norm": 1.7087924145948397, + "language_loss": 0.71989536, + "learning_rate": 4.0789515037781653e-07, + "loss": 0.74193239, + "num_input_tokens_seen": 143333470, + "step": 6647, + "time_per_iteration": 2.727302074432373 + }, + { + "auxiliary_loss_clip": 0.01126872, + "auxiliary_loss_mlp": 0.01084267, + "balance_loss_clip": 1.02561772, + "balance_loss_mlp": 1.00406313, + "epoch": 0.7993747369686768, + "flos": 12640772321280.0, + "grad_norm": 1.6335243860590902, + "language_loss": 0.82691693, + "learning_rate": 4.0742381595231755e-07, + "loss": 0.84902829, + "num_input_tokens_seen": 143350195, + "step": 6648, + "time_per_iteration": 2.660794258117676 + }, + { + "auxiliary_loss_clip": 0.0109251, + "auxiliary_loss_mlp": 0.01084044, + "balance_loss_clip": 1.02549267, + "balance_loss_mlp": 1.00398362, + "epoch": 0.7994949798593158, + "flos": 20078769225600.0, + "grad_norm": 1.6361739113860556, + "language_loss": 0.78477103, + "learning_rate": 4.06952723119359e-07, + "loss": 0.80653656, + "num_input_tokens_seen": 143370070, + "step": 6649, + "time_per_iteration": 3.563080072402954 + }, + { + "auxiliary_loss_clip": 0.01116379, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_clip": 1.02492404, + "balance_loss_mlp": 1.0039165, + "epoch": 0.7996152227499549, + "flos": 38654209509120.0, + "grad_norm": 1.8848999424516635, + "language_loss": 0.67229193, + "learning_rate": 4.0648187195040504e-07, + "loss": 0.6942969, + "num_input_tokens_seen": 143392275, + "step": 6650, + "time_per_iteration": 2.8887410163879395 + }, + { + "auxiliary_loss_clip": 0.01109201, + "auxiliary_loss_mlp": 0.01078767, + "balance_loss_clip": 1.02263045, + "balance_loss_mlp": 0.99980259, + "epoch": 0.799735465640594, + "flos": 70243821947520.0, + "grad_norm": 0.8487602458638759, + "language_loss": 0.67573601, + "learning_rate": 4.060112625168848e-07, + "loss": 0.69761574, + "num_input_tokens_seen": 143457385, + "step": 6651, + "time_per_iteration": 5.094924688339233 + }, + { + "auxiliary_loss_clip": 0.01135987, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_clip": 1.02700734, + "balance_loss_mlp": 1.00365353, + "epoch": 0.7998557085312331, + "flos": 24240995550720.0, + "grad_norm": 2.535842236338093, + "language_loss": 0.73962617, + "learning_rate": 4.055408948901886e-07, + "loss": 0.76182318, + "num_input_tokens_seen": 143478785, + "step": 6652, + "time_per_iteration": 2.6296470165252686 + }, + { + "auxiliary_loss_clip": 0.01126543, + "auxiliary_loss_mlp": 0.01084048, + "balance_loss_clip": 1.02563703, + "balance_loss_mlp": 1.00379586, + "epoch": 0.7999759514218722, + "flos": 27564025449600.0, + "grad_norm": 2.416331072094702, + "language_loss": 0.71319276, + "learning_rate": 4.050707691416708e-07, + "loss": 0.73529863, + "num_input_tokens_seen": 143500095, + "step": 6653, + "time_per_iteration": 2.7507505416870117 + }, + { + "auxiliary_loss_clip": 0.01109016, + "auxiliary_loss_mlp": 0.01078876, + "balance_loss_clip": 1.02245164, + "balance_loss_mlp": 0.99991208, + "epoch": 0.8000961943125112, + "flos": 67337428878720.0, + "grad_norm": 0.6709448034944322, + "language_loss": 0.59799594, + "learning_rate": 4.046008853426495e-07, + "loss": 0.61987478, + "num_input_tokens_seen": 143563410, + "step": 6654, + "time_per_iteration": 3.3138206005096436 + }, + { + "auxiliary_loss_clip": 0.01107082, + "auxiliary_loss_mlp": 0.01085046, + "balance_loss_clip": 1.02460122, + "balance_loss_mlp": 1.0048902, + "epoch": 0.8002164372031504, + "flos": 28733815676160.0, + "grad_norm": 1.889187296466417, + "language_loss": 0.62555128, + "learning_rate": 4.0413124356440464e-07, + "loss": 0.6474725, + "num_input_tokens_seen": 143587455, + "step": 6655, + "time_per_iteration": 2.815782070159912 + }, + { + "auxiliary_loss_clip": 0.01098152, + "auxiliary_loss_mlp": 0.01084244, + "balance_loss_clip": 1.02278638, + "balance_loss_mlp": 1.00418282, + "epoch": 0.8003366800937894, + "flos": 17639429725440.0, + "grad_norm": 2.0968826429217486, + "language_loss": 0.82251358, + "learning_rate": 4.0366184387818223e-07, + "loss": 0.84433758, + "num_input_tokens_seen": 143605915, + "step": 6656, + "time_per_iteration": 3.8738346099853516 + }, + { + "auxiliary_loss_clip": 0.01135813, + "auxiliary_loss_mlp": 0.01084617, + "balance_loss_clip": 1.02552164, + "balance_loss_mlp": 1.00446117, + "epoch": 0.8004569229844285, + "flos": 25995303797760.0, + "grad_norm": 1.8731647455109202, + "language_loss": 0.84979272, + "learning_rate": 4.0319268635518797e-07, + "loss": 0.871997, + "num_input_tokens_seen": 143626490, + "step": 6657, + "time_per_iteration": 287.98531579971313 + }, + { + "auxiliary_loss_clip": 0.01126451, + "auxiliary_loss_mlp": 0.0108356, + "balance_loss_clip": 1.02576947, + "balance_loss_mlp": 1.00345135, + "epoch": 0.8005771658750677, + "flos": 20812352688000.0, + "grad_norm": 1.5377626371463011, + "language_loss": 0.75469768, + "learning_rate": 4.027237710665943e-07, + "loss": 0.77679783, + "num_input_tokens_seen": 143644955, + "step": 6658, + "time_per_iteration": 2.669299840927124 + }, + { + "auxiliary_loss_clip": 0.01107417, + "auxiliary_loss_mlp": 0.01083413, + "balance_loss_clip": 1.02373624, + "balance_loss_mlp": 1.00330472, + "epoch": 0.8006974087657067, + "flos": 25812626204160.0, + "grad_norm": 1.895828809607891, + "language_loss": 0.69717503, + "learning_rate": 4.022550980835344e-07, + "loss": 0.71908331, + "num_input_tokens_seen": 143667200, + "step": 6659, + "time_per_iteration": 2.850606918334961 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01084259, + "balance_loss_clip": 1.02322698, + "balance_loss_mlp": 1.00415075, + "epoch": 0.8008176516563458, + "flos": 17164690646400.0, + "grad_norm": 2.051692971999445, + "language_loss": 0.79224885, + "learning_rate": 4.017866674771051e-07, + "loss": 0.81414455, + "num_input_tokens_seen": 143684685, + "step": 6660, + "time_per_iteration": 2.7685775756835938 + }, + { + "auxiliary_loss_clip": 0.01091639, + "auxiliary_loss_mlp": 0.0108478, + "balance_loss_clip": 1.02246571, + "balance_loss_mlp": 1.00452864, + "epoch": 0.8009378945469849, + "flos": 24207311571840.0, + "grad_norm": 1.5959013431373419, + "language_loss": 0.74456155, + "learning_rate": 4.013184793183688e-07, + "loss": 0.76632577, + "num_input_tokens_seen": 143706780, + "step": 6661, + "time_per_iteration": 2.8913650512695312 + }, + { + "auxiliary_loss_clip": 0.0112777, + "auxiliary_loss_mlp": 0.010851, + "balance_loss_clip": 1.02639842, + "balance_loss_mlp": 1.0048964, + "epoch": 0.801058137437624, + "flos": 19787318271360.0, + "grad_norm": 1.6437959776109565, + "language_loss": 0.72561699, + "learning_rate": 4.008505336783472e-07, + "loss": 0.74774569, + "num_input_tokens_seen": 143724505, + "step": 6662, + "time_per_iteration": 2.660547971725464 + }, + { + "auxiliary_loss_clip": 0.01126944, + "auxiliary_loss_mlp": 0.01084127, + "balance_loss_clip": 1.02645111, + "balance_loss_mlp": 1.00406623, + "epoch": 0.801178380328263, + "flos": 18659400324480.0, + "grad_norm": 1.6684066530831976, + "language_loss": 0.80485952, + "learning_rate": 4.003828306280284e-07, + "loss": 0.82697022, + "num_input_tokens_seen": 143742180, + "step": 6663, + "time_per_iteration": 2.7183032035827637 + }, + { + "auxiliary_loss_clip": 0.0111159, + "auxiliary_loss_mlp": 0.01083335, + "balance_loss_clip": 1.02731621, + "balance_loss_mlp": 1.00332177, + "epoch": 0.8012986232189022, + "flos": 15706573948800.0, + "grad_norm": 1.7223248395111022, + "language_loss": 0.77954239, + "learning_rate": 3.999153702383626e-07, + "loss": 0.80149162, + "num_input_tokens_seen": 143760070, + "step": 6664, + "time_per_iteration": 2.672257423400879 + }, + { + "auxiliary_loss_clip": 0.0112626, + "auxiliary_loss_mlp": 0.01083611, + "balance_loss_clip": 1.02502275, + "balance_loss_mlp": 1.00345516, + "epoch": 0.8014188661095413, + "flos": 28584139703040.0, + "grad_norm": 2.1420400451477355, + "language_loss": 0.73868001, + "learning_rate": 3.9944815258026263e-07, + "loss": 0.76077878, + "num_input_tokens_seen": 143781890, + "step": 6665, + "time_per_iteration": 2.769683361053467 + }, + { + "auxiliary_loss_clip": 0.01127381, + "auxiliary_loss_mlp": 0.01084655, + "balance_loss_clip": 1.02654624, + "balance_loss_mlp": 1.00454617, + "epoch": 0.8015391090001803, + "flos": 29310360877440.0, + "grad_norm": 1.9481944887688771, + "language_loss": 0.83127999, + "learning_rate": 3.989811777246057e-07, + "loss": 0.85340035, + "num_input_tokens_seen": 143802060, + "step": 6666, + "time_per_iteration": 2.7025492191314697 + }, + { + "auxiliary_loss_clip": 0.01114203, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_clip": 1.01897693, + "balance_loss_mlp": 0.99990892, + "epoch": 0.8016593518908195, + "flos": 70397340675840.0, + "grad_norm": 0.858432467737541, + "language_loss": 0.6626066, + "learning_rate": 3.985144457422305e-07, + "loss": 0.68453729, + "num_input_tokens_seen": 143856345, + "step": 6667, + "time_per_iteration": 3.1605441570281982 + }, + { + "auxiliary_loss_clip": 0.01136642, + "auxiliary_loss_mlp": 0.01084655, + "balance_loss_clip": 1.02676439, + "balance_loss_mlp": 1.0045464, + "epoch": 0.8017795947814585, + "flos": 26026114688640.0, + "grad_norm": 1.924094089873719, + "language_loss": 0.77026939, + "learning_rate": 3.9804795670394096e-07, + "loss": 0.79248238, + "num_input_tokens_seen": 143876470, + "step": 6668, + "time_per_iteration": 2.6934032440185547 + }, + { + "auxiliary_loss_clip": 0.01116411, + "auxiliary_loss_mlp": 0.01082825, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.00281155, + "epoch": 0.8018998376720976, + "flos": 22087181260800.0, + "grad_norm": 1.743956258605362, + "language_loss": 0.70558083, + "learning_rate": 3.975817106805022e-07, + "loss": 0.72757322, + "num_input_tokens_seen": 143895170, + "step": 6669, + "time_per_iteration": 2.673186779022217 + }, + { + "auxiliary_loss_clip": 0.01105515, + "auxiliary_loss_mlp": 0.01084298, + "balance_loss_clip": 1.02321374, + "balance_loss_mlp": 1.00414157, + "epoch": 0.8020200805627368, + "flos": 34568545023360.0, + "grad_norm": 1.7570126357465556, + "language_loss": 0.64915639, + "learning_rate": 3.97115707742645e-07, + "loss": 0.67105454, + "num_input_tokens_seen": 143915845, + "step": 6670, + "time_per_iteration": 2.855515480041504 + }, + { + "auxiliary_loss_clip": 0.01116848, + "auxiliary_loss_mlp": 0.01083601, + "balance_loss_clip": 1.02588773, + "balance_loss_mlp": 1.0035882, + "epoch": 0.8021403234533758, + "flos": 20120354196480.0, + "grad_norm": 11.263399073029513, + "language_loss": 0.64671791, + "learning_rate": 3.966499479610599e-07, + "loss": 0.66872239, + "num_input_tokens_seen": 143933940, + "step": 6671, + "time_per_iteration": 2.681264638900757 + }, + { + "auxiliary_loss_clip": 0.01099651, + "auxiliary_loss_mlp": 0.01083963, + "balance_loss_clip": 1.02339005, + "balance_loss_mlp": 1.0039494, + "epoch": 0.8022605663440149, + "flos": 27746200252800.0, + "grad_norm": 1.669378796982204, + "language_loss": 0.64989918, + "learning_rate": 3.9618443140640225e-07, + "loss": 0.67173529, + "num_input_tokens_seen": 143952850, + "step": 6672, + "time_per_iteration": 2.834711790084839 + }, + { + "auxiliary_loss_clip": 0.01079361, + "auxiliary_loss_mlp": 0.01079112, + "balance_loss_clip": 1.01781011, + "balance_loss_mlp": 1.00014806, + "epoch": 0.802380809234654, + "flos": 60244998768000.0, + "grad_norm": 0.6842943274277173, + "language_loss": 0.51389229, + "learning_rate": 3.957191581492918e-07, + "loss": 0.53547704, + "num_input_tokens_seen": 144013610, + "step": 6673, + "time_per_iteration": 3.383392572402954 + }, + { + "auxiliary_loss_clip": 0.01117688, + "auxiliary_loss_mlp": 0.0108449, + "balance_loss_clip": 1.02599573, + "balance_loss_mlp": 1.00428581, + "epoch": 0.8025010521252931, + "flos": 15080722352640.0, + "grad_norm": 2.606328494537285, + "language_loss": 0.71349072, + "learning_rate": 3.952541282603097e-07, + "loss": 0.7355125, + "num_input_tokens_seen": 144028715, + "step": 6674, + "time_per_iteration": 3.583233594894409 + }, + { + "auxiliary_loss_clip": 0.01125002, + "auxiliary_loss_mlp": 0.0108479, + "balance_loss_clip": 1.02485228, + "balance_loss_mlp": 1.00463414, + "epoch": 0.8026212950159322, + "flos": 22163527618560.0, + "grad_norm": 3.127670430846929, + "language_loss": 0.83378625, + "learning_rate": 3.9478934181000013e-07, + "loss": 0.85588419, + "num_input_tokens_seen": 144048740, + "step": 6675, + "time_per_iteration": 2.707123279571533 + }, + { + "auxiliary_loss_clip": 0.0113565, + "auxiliary_loss_mlp": 0.01083102, + "balance_loss_clip": 1.02613688, + "balance_loss_mlp": 1.00289845, + "epoch": 0.8027415379065713, + "flos": 17675986792320.0, + "grad_norm": 2.0411263478265567, + "language_loss": 0.83903182, + "learning_rate": 3.943247988688714e-07, + "loss": 0.86121929, + "num_input_tokens_seen": 144067435, + "step": 6676, + "time_per_iteration": 3.6145143508911133 + }, + { + "auxiliary_loss_clip": 0.01127067, + "auxiliary_loss_mlp": 0.01083736, + "balance_loss_clip": 1.02605176, + "balance_loss_mlp": 1.00372267, + "epoch": 0.8028617807972104, + "flos": 21979593048960.0, + "grad_norm": 1.7016698530970882, + "language_loss": 0.72307092, + "learning_rate": 3.938604995073933e-07, + "loss": 0.74517894, + "num_input_tokens_seen": 144085905, + "step": 6677, + "time_per_iteration": 3.6546719074249268 + }, + { + "auxiliary_loss_clip": 0.01117783, + "auxiliary_loss_mlp": 0.01083848, + "balance_loss_clip": 1.02548456, + "balance_loss_mlp": 1.00373912, + "epoch": 0.8029820236878494, + "flos": 26428457905920.0, + "grad_norm": 1.7283013084646919, + "language_loss": 0.65495694, + "learning_rate": 3.9339644379600157e-07, + "loss": 0.67697322, + "num_input_tokens_seen": 144105735, + "step": 6678, + "time_per_iteration": 2.764566421508789 + }, + { + "auxiliary_loss_clip": 0.01111127, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_clip": 1.02693713, + "balance_loss_mlp": 1.00420928, + "epoch": 0.8031022665784886, + "flos": 17676489582720.0, + "grad_norm": 1.8196055685642138, + "language_loss": 0.70744616, + "learning_rate": 3.929326318050907e-07, + "loss": 0.72940111, + "num_input_tokens_seen": 144123405, + "step": 6679, + "time_per_iteration": 2.66593599319458 + }, + { + "auxiliary_loss_clip": 0.01134583, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_clip": 1.02511525, + "balance_loss_mlp": 1.00352407, + "epoch": 0.8032225094691277, + "flos": 15450279431040.0, + "grad_norm": 1.8714873186337497, + "language_loss": 0.78812182, + "learning_rate": 3.924690636050225e-07, + "loss": 0.81030488, + "num_input_tokens_seen": 144140815, + "step": 6680, + "time_per_iteration": 3.613344192504883 + }, + { + "auxiliary_loss_clip": 0.01125427, + "auxiliary_loss_mlp": 0.01083847, + "balance_loss_clip": 1.02565575, + "balance_loss_mlp": 1.0036428, + "epoch": 0.8033427523597667, + "flos": 26179202453760.0, + "grad_norm": 1.997188975449798, + "language_loss": 0.7334643, + "learning_rate": 3.9200573926611915e-07, + "loss": 0.75555706, + "num_input_tokens_seen": 144162230, + "step": 6681, + "time_per_iteration": 2.785543203353882 + }, + { + "auxiliary_loss_clip": 0.01121611, + "auxiliary_loss_mlp": 0.01084316, + "balance_loss_clip": 1.0227412, + "balance_loss_mlp": 1.00411177, + "epoch": 0.8034629952504058, + "flos": 21324905809920.0, + "grad_norm": 1.8172179823022614, + "language_loss": 0.72803307, + "learning_rate": 3.9154265885866613e-07, + "loss": 0.75009239, + "num_input_tokens_seen": 144181540, + "step": 6682, + "time_per_iteration": 2.6851084232330322 + }, + { + "auxiliary_loss_clip": 0.01119257, + "auxiliary_loss_mlp": 0.01083556, + "balance_loss_clip": 1.02520132, + "balance_loss_mlp": 1.00330424, + "epoch": 0.8035832381410449, + "flos": 21651585027840.0, + "grad_norm": 8.305547411586401, + "language_loss": 0.75029087, + "learning_rate": 3.9107982245291394e-07, + "loss": 0.77231902, + "num_input_tokens_seen": 144199665, + "step": 6683, + "time_per_iteration": 2.725189208984375 + }, + { + "auxiliary_loss_clip": 0.01099892, + "auxiliary_loss_mlp": 0.01084884, + "balance_loss_clip": 1.02339101, + "balance_loss_mlp": 1.00472724, + "epoch": 0.803703481031684, + "flos": 20518818744960.0, + "grad_norm": 2.871208704320136, + "language_loss": 0.77401346, + "learning_rate": 3.9061723011907245e-07, + "loss": 0.79586124, + "num_input_tokens_seen": 144219020, + "step": 6684, + "time_per_iteration": 2.6976425647735596 + }, + { + "auxiliary_loss_clip": 0.01116981, + "auxiliary_loss_mlp": 0.01083642, + "balance_loss_clip": 1.02498209, + "balance_loss_mlp": 1.00348544, + "epoch": 0.803823723922323, + "flos": 22854807838080.0, + "grad_norm": 1.7042862002281296, + "language_loss": 0.79198527, + "learning_rate": 3.901548819273179e-07, + "loss": 0.81399155, + "num_input_tokens_seen": 144239035, + "step": 6685, + "time_per_iteration": 2.7460837364196777 + }, + { + "auxiliary_loss_clip": 0.01127864, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_clip": 1.02742779, + "balance_loss_mlp": 1.00363016, + "epoch": 0.8039439668129622, + "flos": 21362145235200.0, + "grad_norm": 1.7113903230909915, + "language_loss": 0.69233203, + "learning_rate": 3.896927779477881e-07, + "loss": 0.71444762, + "num_input_tokens_seen": 144258295, + "step": 6686, + "time_per_iteration": 2.6687748432159424 + }, + { + "auxiliary_loss_clip": 0.01108388, + "auxiliary_loss_mlp": 0.01084017, + "balance_loss_clip": 1.02472997, + "balance_loss_mlp": 1.004004, + "epoch": 0.8040642097036013, + "flos": 23802382575360.0, + "grad_norm": 2.0044946188096926, + "language_loss": 0.66946453, + "learning_rate": 3.892309182505833e-07, + "loss": 0.69138861, + "num_input_tokens_seen": 144276110, + "step": 6687, + "time_per_iteration": 2.891003131866455 + }, + { + "auxiliary_loss_clip": 0.01134694, + "auxiliary_loss_mlp": 0.01084066, + "balance_loss_clip": 1.02514708, + "balance_loss_mlp": 1.00390959, + "epoch": 0.8041844525942403, + "flos": 25922046009600.0, + "grad_norm": 3.664261256457188, + "language_loss": 0.85985982, + "learning_rate": 3.887693029057675e-07, + "loss": 0.88204741, + "num_input_tokens_seen": 144295620, + "step": 6688, + "time_per_iteration": 2.648223400115967 + }, + { + "auxiliary_loss_clip": 0.01116422, + "auxiliary_loss_mlp": 0.01084188, + "balance_loss_clip": 1.02447963, + "balance_loss_mlp": 1.00417519, + "epoch": 0.8043046954848795, + "flos": 25191120153600.0, + "grad_norm": 1.557558834381904, + "language_loss": 0.81272733, + "learning_rate": 3.8830793198336684e-07, + "loss": 0.83473337, + "num_input_tokens_seen": 144315210, + "step": 6689, + "time_per_iteration": 2.7659428119659424 + }, + { + "auxiliary_loss_clip": 0.01111319, + "auxiliary_loss_mlp": 0.01084122, + "balance_loss_clip": 1.02645814, + "balance_loss_mlp": 1.0040133, + "epoch": 0.8044249383755185, + "flos": 41719185123840.0, + "grad_norm": 1.6791455655236693, + "language_loss": 0.70172977, + "learning_rate": 3.878468055533721e-07, + "loss": 0.72368419, + "num_input_tokens_seen": 144337750, + "step": 6690, + "time_per_iteration": 2.873368501663208 + }, + { + "auxiliary_loss_clip": 0.01107207, + "auxiliary_loss_mlp": 0.0108377, + "balance_loss_clip": 1.02391768, + "balance_loss_mlp": 1.00366163, + "epoch": 0.8045451812661576, + "flos": 20631434860800.0, + "grad_norm": 2.409312498808157, + "language_loss": 0.84473258, + "learning_rate": 3.8738592368573464e-07, + "loss": 0.86664236, + "num_input_tokens_seen": 144355305, + "step": 6691, + "time_per_iteration": 2.7234878540039062 + }, + { + "auxiliary_loss_clip": 0.01101752, + "auxiliary_loss_mlp": 0.01084732, + "balance_loss_clip": 1.02400923, + "balance_loss_mlp": 1.00443292, + "epoch": 0.8046654241567968, + "flos": 29711806254720.0, + "grad_norm": 1.8891973729566929, + "language_loss": 0.88236755, + "learning_rate": 3.8692528645037137e-07, + "loss": 0.90423238, + "num_input_tokens_seen": 144374485, + "step": 6692, + "time_per_iteration": 2.787419319152832 + }, + { + "auxiliary_loss_clip": 0.01136344, + "auxiliary_loss_mlp": 0.01083638, + "balance_loss_clip": 1.02693951, + "balance_loss_mlp": 1.00362515, + "epoch": 0.8047856670474358, + "flos": 17671389851520.0, + "grad_norm": 3.0746361401742734, + "language_loss": 0.77502024, + "learning_rate": 3.8646489391715907e-07, + "loss": 0.79722011, + "num_input_tokens_seen": 144388780, + "step": 6693, + "time_per_iteration": 2.5467894077301025 + }, + { + "auxiliary_loss_clip": 0.01109797, + "auxiliary_loss_mlp": 0.01084526, + "balance_loss_clip": 1.02402997, + "balance_loss_mlp": 1.00441766, + "epoch": 0.8049059099380749, + "flos": 17120699464320.0, + "grad_norm": 2.4048834428021895, + "language_loss": 0.88381255, + "learning_rate": 3.8600474615593903e-07, + "loss": 0.90575576, + "num_input_tokens_seen": 144403395, + "step": 6694, + "time_per_iteration": 2.6813759803771973 + }, + { + "auxiliary_loss_clip": 0.01087855, + "auxiliary_loss_mlp": 0.01078907, + "balance_loss_clip": 1.0171237, + "balance_loss_mlp": 0.99994278, + "epoch": 0.805026152828714, + "flos": 62212903240320.0, + "grad_norm": 0.7860586962650312, + "language_loss": 0.59704262, + "learning_rate": 3.8554484323651605e-07, + "loss": 0.61871022, + "num_input_tokens_seen": 144465265, + "step": 6695, + "time_per_iteration": 3.3257997035980225 + }, + { + "auxiliary_loss_clip": 0.01125428, + "auxiliary_loss_mlp": 0.00872881, + "balance_loss_clip": 1.02524555, + "balance_loss_mlp": 1.0000757, + "epoch": 0.8051463957193531, + "flos": 21688608971520.0, + "grad_norm": 1.5101605876312245, + "language_loss": 0.79305816, + "learning_rate": 3.85085185228657e-07, + "loss": 0.81304133, + "num_input_tokens_seen": 144484235, + "step": 6696, + "time_per_iteration": 2.6725504398345947 + }, + { + "auxiliary_loss_clip": 0.01117319, + "auxiliary_loss_mlp": 0.01085186, + "balance_loss_clip": 1.02528381, + "balance_loss_mlp": 1.00503004, + "epoch": 0.8052666386099921, + "flos": 32051458535040.0, + "grad_norm": 1.80602417802085, + "language_loss": 0.73159486, + "learning_rate": 3.8462577220209114e-07, + "loss": 0.75361997, + "num_input_tokens_seen": 144504610, + "step": 6697, + "time_per_iteration": 2.70027494430542 + }, + { + "auxiliary_loss_clip": 0.0111391, + "auxiliary_loss_mlp": 0.0107908, + "balance_loss_clip": 1.01873374, + "balance_loss_mlp": 1.00011563, + "epoch": 0.8053868815006313, + "flos": 67157875768320.0, + "grad_norm": 0.7095299982519482, + "language_loss": 0.59057111, + "learning_rate": 3.8416660422651127e-07, + "loss": 0.61250097, + "num_input_tokens_seen": 144574260, + "step": 6698, + "time_per_iteration": 3.2723991870880127 + }, + { + "auxiliary_loss_clip": 0.01106543, + "auxiliary_loss_mlp": 0.01084592, + "balance_loss_clip": 1.02292883, + "balance_loss_mlp": 1.00434017, + "epoch": 0.8055071243912704, + "flos": 23837000307840.0, + "grad_norm": 1.6808567369584082, + "language_loss": 0.67813885, + "learning_rate": 3.837076813715723e-07, + "loss": 0.70005023, + "num_input_tokens_seen": 144594145, + "step": 6699, + "time_per_iteration": 2.7740321159362793 + }, + { + "auxiliary_loss_clip": 0.01111528, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_clip": 1.02580607, + "balance_loss_mlp": 1.00422525, + "epoch": 0.8056273672819094, + "flos": 21324510760320.0, + "grad_norm": 1.8001126086696515, + "language_loss": 0.74815929, + "learning_rate": 3.832490037068941e-07, + "loss": 0.77011979, + "num_input_tokens_seen": 144612935, + "step": 6700, + "time_per_iteration": 4.680741310119629 + }, + { + "auxiliary_loss_clip": 0.01086502, + "auxiliary_loss_mlp": 0.01083428, + "balance_loss_clip": 1.02162266, + "balance_loss_mlp": 1.00327158, + "epoch": 0.8057476101725486, + "flos": 25768383626880.0, + "grad_norm": 1.9596000815660102, + "language_loss": 0.75644946, + "learning_rate": 3.827905713020554e-07, + "loss": 0.77814877, + "num_input_tokens_seen": 144630580, + "step": 6701, + "time_per_iteration": 2.8810572624206543 + }, + { + "auxiliary_loss_clip": 0.01111567, + "auxiliary_loss_mlp": 0.01085443, + "balance_loss_clip": 1.02542031, + "balance_loss_mlp": 1.00519133, + "epoch": 0.8058678530631876, + "flos": 24535283679360.0, + "grad_norm": 2.05949073766213, + "language_loss": 0.69033796, + "learning_rate": 3.823323842266017e-07, + "loss": 0.71230805, + "num_input_tokens_seen": 144649975, + "step": 6702, + "time_per_iteration": 2.8269975185394287 + }, + { + "auxiliary_loss_clip": 0.01126272, + "auxiliary_loss_mlp": 0.0108355, + "balance_loss_clip": 1.02508068, + "balance_loss_mlp": 1.00348938, + "epoch": 0.8059880959538267, + "flos": 24753728240640.0, + "grad_norm": 2.2963744215829647, + "language_loss": 0.72858638, + "learning_rate": 3.818744425500393e-07, + "loss": 0.75068462, + "num_input_tokens_seen": 144667990, + "step": 6703, + "time_per_iteration": 3.6049447059631348 + }, + { + "auxiliary_loss_clip": 0.01107131, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.02338839, + "balance_loss_mlp": 1.00460017, + "epoch": 0.8061083388444659, + "flos": 22196349671040.0, + "grad_norm": 1.6610059161000141, + "language_loss": 0.80532372, + "learning_rate": 3.8141674634183675e-07, + "loss": 0.82724261, + "num_input_tokens_seen": 144687020, + "step": 6704, + "time_per_iteration": 2.707977771759033 + }, + { + "auxiliary_loss_clip": 0.01094482, + "auxiliary_loss_mlp": 0.01083594, + "balance_loss_clip": 1.02242565, + "balance_loss_mlp": 1.00353336, + "epoch": 0.8062285817351049, + "flos": 30044195735040.0, + "grad_norm": 3.025120921849664, + "language_loss": 0.65927529, + "learning_rate": 3.809592956714278e-07, + "loss": 0.68105602, + "num_input_tokens_seen": 144710255, + "step": 6705, + "time_per_iteration": 2.897932767868042 + }, + { + "auxiliary_loss_clip": 0.01129004, + "auxiliary_loss_mlp": 0.01085114, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.00491023, + "epoch": 0.806348824625744, + "flos": 22782591544320.0, + "grad_norm": 2.4445993801037074, + "language_loss": 0.74537051, + "learning_rate": 3.805020906082057e-07, + "loss": 0.76751173, + "num_input_tokens_seen": 144728830, + "step": 6706, + "time_per_iteration": 3.6370091438293457 + }, + { + "auxiliary_loss_clip": 0.01115607, + "auxiliary_loss_mlp": 0.01083503, + "balance_loss_clip": 1.02378011, + "balance_loss_mlp": 1.0033946, + "epoch": 0.8064690675163831, + "flos": 23404600385280.0, + "grad_norm": 1.8950463596327674, + "language_loss": 0.80800962, + "learning_rate": 3.8004513122152917e-07, + "loss": 0.8300007, + "num_input_tokens_seen": 144747140, + "step": 6707, + "time_per_iteration": 2.759835720062256 + }, + { + "auxiliary_loss_clip": 0.01112916, + "auxiliary_loss_mlp": 0.01084099, + "balance_loss_clip": 1.02240682, + "balance_loss_mlp": 1.00394273, + "epoch": 0.8065893104070222, + "flos": 24060903736320.0, + "grad_norm": 1.7759669520691261, + "language_loss": 0.67406201, + "learning_rate": 3.79588417580718e-07, + "loss": 0.69603217, + "num_input_tokens_seen": 144765250, + "step": 6708, + "time_per_iteration": 2.7682628631591797 + }, + { + "auxiliary_loss_clip": 0.01126512, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_clip": 1.02599251, + "balance_loss_mlp": 1.0041095, + "epoch": 0.8067095532976613, + "flos": 22305410340480.0, + "grad_norm": 1.910138689536877, + "language_loss": 0.76358289, + "learning_rate": 3.791319497550558e-07, + "loss": 0.78569067, + "num_input_tokens_seen": 144783080, + "step": 6709, + "time_per_iteration": 2.7138803005218506 + }, + { + "auxiliary_loss_clip": 0.01091252, + "auxiliary_loss_mlp": 0.00872788, + "balance_loss_clip": 1.02456081, + "balance_loss_mlp": 1.00005376, + "epoch": 0.8068297961883004, + "flos": 17129498296320.0, + "grad_norm": 1.8457032366921229, + "language_loss": 0.70467293, + "learning_rate": 3.78675727813788e-07, + "loss": 0.72431332, + "num_input_tokens_seen": 144800645, + "step": 6710, + "time_per_iteration": 2.7135658264160156 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.01085435, + "balance_loss_clip": 1.02254641, + "balance_loss_mlp": 1.00523138, + "epoch": 0.8069500390789395, + "flos": 22018843635840.0, + "grad_norm": 2.7206267584620694, + "language_loss": 0.7354821, + "learning_rate": 3.782197518261225e-07, + "loss": 0.75747073, + "num_input_tokens_seen": 144820085, + "step": 6711, + "time_per_iteration": 2.715836524963379 + }, + { + "auxiliary_loss_clip": 0.01119538, + "auxiliary_loss_mlp": 0.01084447, + "balance_loss_clip": 1.02643669, + "balance_loss_mlp": 1.00429058, + "epoch": 0.8070702819695785, + "flos": 19244241567360.0, + "grad_norm": 1.9094726797769719, + "language_loss": 0.95366991, + "learning_rate": 3.777640218612319e-07, + "loss": 0.9757098, + "num_input_tokens_seen": 144838070, + "step": 6712, + "time_per_iteration": 2.72218918800354 + }, + { + "auxiliary_loss_clip": 0.01118861, + "auxiliary_loss_mlp": 0.01084041, + "balance_loss_clip": 1.02509642, + "balance_loss_mlp": 1.00388443, + "epoch": 0.8071905248602176, + "flos": 21544320038400.0, + "grad_norm": 2.2719163860355516, + "language_loss": 0.72110218, + "learning_rate": 3.773085379882488e-07, + "loss": 0.74313122, + "num_input_tokens_seen": 144857125, + "step": 6713, + "time_per_iteration": 2.651449680328369 + }, + { + "auxiliary_loss_clip": 0.01127316, + "auxiliary_loss_mlp": 0.00872941, + "balance_loss_clip": 1.0258646, + "balance_loss_mlp": 1.00005317, + "epoch": 0.8073107677508568, + "flos": 37268309105280.0, + "grad_norm": 1.8889666329111487, + "language_loss": 0.75637192, + "learning_rate": 3.768533002762715e-07, + "loss": 0.77637446, + "num_input_tokens_seen": 144880660, + "step": 6714, + "time_per_iteration": 2.8532803058624268 + }, + { + "auxiliary_loss_clip": 0.01117518, + "auxiliary_loss_mlp": 0.01083632, + "balance_loss_clip": 1.02464342, + "balance_loss_mlp": 1.00357139, + "epoch": 0.8074310106414958, + "flos": 28366269759360.0, + "grad_norm": 5.907030253977695, + "language_loss": 0.7668817, + "learning_rate": 3.763983087943572e-07, + "loss": 0.78889316, + "num_input_tokens_seen": 144900050, + "step": 6715, + "time_per_iteration": 2.76127290725708 + }, + { + "auxiliary_loss_clip": 0.01127052, + "auxiliary_loss_mlp": 0.00872867, + "balance_loss_clip": 1.02558422, + "balance_loss_mlp": 1.00005376, + "epoch": 0.8075512535321349, + "flos": 24281646768000.0, + "grad_norm": 1.6718092130400226, + "language_loss": 0.80997133, + "learning_rate": 3.759435636115282e-07, + "loss": 0.82997048, + "num_input_tokens_seen": 144920835, + "step": 6716, + "time_per_iteration": 2.7394063472747803 + }, + { + "auxiliary_loss_clip": 0.01084021, + "auxiliary_loss_mlp": 0.00872861, + "balance_loss_clip": 1.02081585, + "balance_loss_mlp": 1.00011873, + "epoch": 0.807671496422774, + "flos": 26030855283840.0, + "grad_norm": 1.6878504506448169, + "language_loss": 0.72845548, + "learning_rate": 3.7548906479676967e-07, + "loss": 0.74802428, + "num_input_tokens_seen": 144940430, + "step": 6717, + "time_per_iteration": 2.912447690963745 + }, + { + "auxiliary_loss_clip": 0.0112617, + "auxiliary_loss_mlp": 0.01085195, + "balance_loss_clip": 1.02511609, + "balance_loss_mlp": 1.00499117, + "epoch": 0.8077917393134131, + "flos": 23730740899200.0, + "grad_norm": 1.6675228318632946, + "language_loss": 0.7160238, + "learning_rate": 3.7503481241902855e-07, + "loss": 0.73813742, + "num_input_tokens_seen": 144960405, + "step": 6718, + "time_per_iteration": 2.7505972385406494 + }, + { + "auxiliary_loss_clip": 0.01117981, + "auxiliary_loss_mlp": 0.00872832, + "balance_loss_clip": 1.02620125, + "balance_loss_mlp": 1.00009322, + "epoch": 0.8079119822040521, + "flos": 18402028398720.0, + "grad_norm": 5.192485828714286, + "language_loss": 0.80042827, + "learning_rate": 3.745808065472145e-07, + "loss": 0.82033634, + "num_input_tokens_seen": 144977700, + "step": 6719, + "time_per_iteration": 2.7427446842193604 + }, + { + "auxiliary_loss_clip": 0.01121261, + "auxiliary_loss_mlp": 0.0108375, + "balance_loss_clip": 1.02340233, + "balance_loss_mlp": 1.00364113, + "epoch": 0.8080322250946913, + "flos": 23621787970560.0, + "grad_norm": 1.4851529154909082, + "language_loss": 0.76252991, + "learning_rate": 3.741270472501994e-07, + "loss": 0.78457999, + "num_input_tokens_seen": 144998340, + "step": 6720, + "time_per_iteration": 2.724976062774658 + }, + { + "auxiliary_loss_clip": 0.01116072, + "auxiliary_loss_mlp": 0.01084387, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.00427806, + "epoch": 0.8081524679853304, + "flos": 22820692896000.0, + "grad_norm": 1.6271409227082763, + "language_loss": 0.72637546, + "learning_rate": 3.736735345968183e-07, + "loss": 0.74838006, + "num_input_tokens_seen": 145017950, + "step": 6721, + "time_per_iteration": 2.7192020416259766 + }, + { + "auxiliary_loss_clip": 0.01126497, + "auxiliary_loss_mlp": 0.01083457, + "balance_loss_clip": 1.0257436, + "balance_loss_mlp": 1.00334835, + "epoch": 0.8082727108759694, + "flos": 17640004343040.0, + "grad_norm": 1.9044544248080921, + "language_loss": 0.78940666, + "learning_rate": 3.7322026865586986e-07, + "loss": 0.81150621, + "num_input_tokens_seen": 145036985, + "step": 6722, + "time_per_iteration": 2.646733522415161 + }, + { + "auxiliary_loss_clip": 0.01129881, + "auxiliary_loss_mlp": 0.01085119, + "balance_loss_clip": 1.02820635, + "balance_loss_mlp": 1.00501037, + "epoch": 0.8083929537666086, + "flos": 25958172113280.0, + "grad_norm": 2.051596626220601, + "language_loss": 0.73278546, + "learning_rate": 3.7276724949611206e-07, + "loss": 0.75493544, + "num_input_tokens_seen": 145057095, + "step": 6723, + "time_per_iteration": 2.7115516662597656 + }, + { + "auxiliary_loss_clip": 0.01115619, + "auxiliary_loss_mlp": 0.01084246, + "balance_loss_clip": 1.02432966, + "balance_loss_mlp": 1.00404167, + "epoch": 0.8085131966572476, + "flos": 27089178629760.0, + "grad_norm": 2.3228517793680368, + "language_loss": 0.75167006, + "learning_rate": 3.723144771862694e-07, + "loss": 0.77366871, + "num_input_tokens_seen": 145077735, + "step": 6724, + "time_per_iteration": 2.7696890830993652 + }, + { + "auxiliary_loss_clip": 0.01106127, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_clip": 1.02259207, + "balance_loss_mlp": 1.00392056, + "epoch": 0.8086334395478867, + "flos": 23988543788160.0, + "grad_norm": 1.56165439493065, + "language_loss": 0.76608884, + "learning_rate": 3.718619517950263e-07, + "loss": 0.78798997, + "num_input_tokens_seen": 145098330, + "step": 6725, + "time_per_iteration": 3.611091375350952 + }, + { + "auxiliary_loss_clip": 0.01136171, + "auxiliary_loss_mlp": 0.01084909, + "balance_loss_clip": 1.02687621, + "balance_loss_mlp": 1.0048002, + "epoch": 0.8087536824385259, + "flos": 20405879406720.0, + "grad_norm": 2.0451291610243567, + "language_loss": 0.769099, + "learning_rate": 3.714096733910301e-07, + "loss": 0.79130971, + "num_input_tokens_seen": 145115855, + "step": 6726, + "time_per_iteration": 3.6095762252807617 + }, + { + "auxiliary_loss_clip": 0.01113049, + "auxiliary_loss_mlp": 0.01085387, + "balance_loss_clip": 1.02774608, + "balance_loss_mlp": 1.00508714, + "epoch": 0.8088739253291649, + "flos": 25919639798400.0, + "grad_norm": 2.618137633921754, + "language_loss": 0.70643175, + "learning_rate": 3.709576420428926e-07, + "loss": 0.72841609, + "num_input_tokens_seen": 145136655, + "step": 6727, + "time_per_iteration": 2.7046420574188232 + }, + { + "auxiliary_loss_clip": 0.01117808, + "auxiliary_loss_mlp": 0.010836, + "balance_loss_clip": 1.02471566, + "balance_loss_mlp": 1.00353861, + "epoch": 0.808994168219804, + "flos": 28402072640640.0, + "grad_norm": 3.0438999739815884, + "language_loss": 0.7339077, + "learning_rate": 3.7050585781918463e-07, + "loss": 0.75592178, + "num_input_tokens_seen": 145156955, + "step": 6728, + "time_per_iteration": 2.7528719902038574 + }, + { + "auxiliary_loss_clip": 0.01128126, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_clip": 1.02668285, + "balance_loss_mlp": 1.00398767, + "epoch": 0.8091144111104431, + "flos": 17421056991360.0, + "grad_norm": 2.4183015376789583, + "language_loss": 0.69056404, + "learning_rate": 3.700543207884428e-07, + "loss": 0.71268725, + "num_input_tokens_seen": 145173865, + "step": 6729, + "time_per_iteration": 3.561673879623413 + }, + { + "auxiliary_loss_clip": 0.0112425, + "auxiliary_loss_mlp": 0.01084318, + "balance_loss_clip": 1.02442789, + "balance_loss_mlp": 1.00420976, + "epoch": 0.8092346540010822, + "flos": 32153803361280.0, + "grad_norm": 1.6669687323895084, + "language_loss": 0.71108687, + "learning_rate": 3.6960303101916466e-07, + "loss": 0.73317254, + "num_input_tokens_seen": 145193780, + "step": 6730, + "time_per_iteration": 2.745943069458008 + }, + { + "auxiliary_loss_clip": 0.01113932, + "auxiliary_loss_mlp": 0.0087299, + "balance_loss_clip": 1.018713, + "balance_loss_mlp": 1.0013243, + "epoch": 0.8093548968917212, + "flos": 58035093390720.0, + "grad_norm": 0.7489877721134749, + "language_loss": 0.5557915, + "learning_rate": 3.6915198857981047e-07, + "loss": 0.57566071, + "num_input_tokens_seen": 145258980, + "step": 6731, + "time_per_iteration": 4.1491858959198 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.01084463, + "balance_loss_clip": 1.02141738, + "balance_loss_mlp": 1.00430703, + "epoch": 0.8094751397823604, + "flos": 27381599251200.0, + "grad_norm": 1.6360434759084246, + "language_loss": 0.68212742, + "learning_rate": 3.687011935388027e-07, + "loss": 0.70401621, + "num_input_tokens_seen": 145281875, + "step": 6732, + "time_per_iteration": 2.7642834186553955 + }, + { + "auxiliary_loss_clip": 0.0112602, + "auxiliary_loss_mlp": 0.01084629, + "balance_loss_clip": 1.02587199, + "balance_loss_mlp": 1.00442541, + "epoch": 0.8095953826729995, + "flos": 24061083304320.0, + "grad_norm": 1.8189924184829922, + "language_loss": 0.72899008, + "learning_rate": 3.6825064596452646e-07, + "loss": 0.75109655, + "num_input_tokens_seen": 145302220, + "step": 6733, + "time_per_iteration": 2.7003700733184814 + }, + { + "auxiliary_loss_clip": 0.01125737, + "auxiliary_loss_mlp": 0.01084741, + "balance_loss_clip": 1.02499855, + "balance_loss_mlp": 1.00458515, + "epoch": 0.8097156255636385, + "flos": 23951412103680.0, + "grad_norm": 1.836189027394562, + "language_loss": 0.70822048, + "learning_rate": 3.678003459253305e-07, + "loss": 0.73032522, + "num_input_tokens_seen": 145323070, + "step": 6734, + "time_per_iteration": 2.6982362270355225 + }, + { + "auxiliary_loss_clip": 0.01106993, + "auxiliary_loss_mlp": 0.01083699, + "balance_loss_clip": 1.02412963, + "balance_loss_mlp": 1.00358999, + "epoch": 0.8098358684542777, + "flos": 21799142098560.0, + "grad_norm": 1.900266164691153, + "language_loss": 0.73961467, + "learning_rate": 3.673502934895236e-07, + "loss": 0.76152164, + "num_input_tokens_seen": 145342575, + "step": 6735, + "time_per_iteration": 2.7315757274627686 + }, + { + "auxiliary_loss_clip": 0.01113714, + "auxiliary_loss_mlp": 0.01079082, + "balance_loss_clip": 1.01853013, + "balance_loss_mlp": 1.00011754, + "epoch": 0.8099561113449167, + "flos": 68809515966720.0, + "grad_norm": 0.6832677410612525, + "language_loss": 0.57998884, + "learning_rate": 3.669004887253802e-07, + "loss": 0.60191679, + "num_input_tokens_seen": 145408865, + "step": 6736, + "time_per_iteration": 3.385908603668213 + }, + { + "auxiliary_loss_clip": 0.01119935, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_clip": 1.02719784, + "balance_loss_mlp": 1.00435233, + "epoch": 0.8100763542355558, + "flos": 23586056916480.0, + "grad_norm": 1.5503441278523384, + "language_loss": 0.78634191, + "learning_rate": 3.664509317011335e-07, + "loss": 0.8083849, + "num_input_tokens_seen": 145429200, + "step": 6737, + "time_per_iteration": 2.7035884857177734 + }, + { + "auxiliary_loss_clip": 0.01126636, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_clip": 1.02633357, + "balance_loss_mlp": 1.00389099, + "epoch": 0.810196597126195, + "flos": 31650408207360.0, + "grad_norm": 2.1954320161347662, + "language_loss": 0.737885, + "learning_rate": 3.6600162248498134e-07, + "loss": 0.75999278, + "num_input_tokens_seen": 145452830, + "step": 6738, + "time_per_iteration": 2.7534596920013428 + }, + { + "auxiliary_loss_clip": 0.01087247, + "auxiliary_loss_mlp": 0.01083529, + "balance_loss_clip": 1.02217603, + "balance_loss_mlp": 1.00342083, + "epoch": 0.810316840016834, + "flos": 24900459298560.0, + "grad_norm": 1.64098716025823, + "language_loss": 0.75965941, + "learning_rate": 3.6555256114508426e-07, + "loss": 0.78136718, + "num_input_tokens_seen": 145472625, + "step": 6739, + "time_per_iteration": 2.848024368286133 + }, + { + "auxiliary_loss_clip": 0.01116905, + "auxiliary_loss_mlp": 0.01084493, + "balance_loss_clip": 1.02344155, + "balance_loss_mlp": 1.00424075, + "epoch": 0.8104370829074731, + "flos": 27965003950080.0, + "grad_norm": 1.806494942282553, + "language_loss": 0.73155904, + "learning_rate": 3.651037477495642e-07, + "loss": 0.75357294, + "num_input_tokens_seen": 145494075, + "step": 6740, + "time_per_iteration": 2.8915884494781494 + }, + { + "auxiliary_loss_clip": 0.01134058, + "auxiliary_loss_mlp": 0.01084419, + "balance_loss_clip": 1.02513909, + "balance_loss_mlp": 1.00426304, + "epoch": 0.8105573257981122, + "flos": 24640752988800.0, + "grad_norm": 1.8077488609342256, + "language_loss": 0.6831643, + "learning_rate": 3.6465518236650584e-07, + "loss": 0.70534909, + "num_input_tokens_seen": 145514220, + "step": 6741, + "time_per_iteration": 2.6252031326293945 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01083678, + "balance_loss_clip": 1.02279854, + "balance_loss_mlp": 1.00361753, + "epoch": 0.8106775686887513, + "flos": 26358935132160.0, + "grad_norm": 1.655104607849092, + "language_loss": 0.78325582, + "learning_rate": 3.642068650639558e-07, + "loss": 0.80514538, + "num_input_tokens_seen": 145533965, + "step": 6742, + "time_per_iteration": 2.8775875568389893 + }, + { + "auxiliary_loss_clip": 0.01118356, + "auxiliary_loss_mlp": 0.01084045, + "balance_loss_clip": 1.02509189, + "balance_loss_mlp": 1.00403166, + "epoch": 0.8107978115793903, + "flos": 27271892136960.0, + "grad_norm": 1.9714779106241225, + "language_loss": 0.64653671, + "learning_rate": 3.6375879590992334e-07, + "loss": 0.66856068, + "num_input_tokens_seen": 145554310, + "step": 6743, + "time_per_iteration": 2.8218741416931152 + }, + { + "auxiliary_loss_clip": 0.01117936, + "auxiliary_loss_mlp": 0.01083047, + "balance_loss_clip": 1.02567697, + "balance_loss_mlp": 1.00289035, + "epoch": 0.8109180544700295, + "flos": 24934322845440.0, + "grad_norm": 1.7377501703261424, + "language_loss": 0.80913633, + "learning_rate": 3.6331097497238173e-07, + "loss": 0.83114618, + "num_input_tokens_seen": 145573755, + "step": 6744, + "time_per_iteration": 2.766155481338501 + }, + { + "auxiliary_loss_clip": 0.0110474, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_clip": 1.02256548, + "balance_loss_mlp": 1.00371909, + "epoch": 0.8110382973606686, + "flos": 21105383840640.0, + "grad_norm": 2.2087799139116964, + "language_loss": 0.80224586, + "learning_rate": 3.628634023192627e-07, + "loss": 0.82413161, + "num_input_tokens_seen": 145594000, + "step": 6745, + "time_per_iteration": 2.8358097076416016 + }, + { + "auxiliary_loss_clip": 0.01125635, + "auxiliary_loss_mlp": 0.01084348, + "balance_loss_clip": 1.0249114, + "balance_loss_mlp": 1.00419188, + "epoch": 0.8111585402513076, + "flos": 15414081500160.0, + "grad_norm": 2.226971146276585, + "language_loss": 0.75223541, + "learning_rate": 3.624160780184644e-07, + "loss": 0.77433521, + "num_input_tokens_seen": 145611215, + "step": 6746, + "time_per_iteration": 2.613300085067749 + }, + { + "auxiliary_loss_clip": 0.01116994, + "auxiliary_loss_mlp": 0.01084435, + "balance_loss_clip": 1.02453041, + "balance_loss_mlp": 1.00423062, + "epoch": 0.8112787831419467, + "flos": 24095736950400.0, + "grad_norm": 2.128962586487113, + "language_loss": 0.74661124, + "learning_rate": 3.6196900213784496e-07, + "loss": 0.7686255, + "num_input_tokens_seen": 145630530, + "step": 6747, + "time_per_iteration": 2.761542797088623 + }, + { + "auxiliary_loss_clip": 0.01125964, + "auxiliary_loss_mlp": 0.01084848, + "balance_loss_clip": 1.0251267, + "balance_loss_mlp": 1.00473905, + "epoch": 0.8113990260325858, + "flos": 20483374999680.0, + "grad_norm": 1.8533620660481207, + "language_loss": 0.86504447, + "learning_rate": 3.6152217474522527e-07, + "loss": 0.88715261, + "num_input_tokens_seen": 145647345, + "step": 6748, + "time_per_iteration": 2.71248459815979 + }, + { + "auxiliary_loss_clip": 0.01125965, + "auxiliary_loss_mlp": 0.01085215, + "balance_loss_clip": 1.02639961, + "balance_loss_mlp": 1.00515449, + "epoch": 0.8115192689232249, + "flos": 24901141656960.0, + "grad_norm": 1.5258835694344666, + "language_loss": 0.72942388, + "learning_rate": 3.6107559590838975e-07, + "loss": 0.75153571, + "num_input_tokens_seen": 145666330, + "step": 6749, + "time_per_iteration": 2.677949905395508 + }, + { + "auxiliary_loss_clip": 0.01084704, + "auxiliary_loss_mlp": 0.0108401, + "balance_loss_clip": 1.01963127, + "balance_loss_mlp": 1.00390148, + "epoch": 0.811639511813864, + "flos": 24057204635520.0, + "grad_norm": 2.372758626589222, + "language_loss": 0.66042238, + "learning_rate": 3.606292656950822e-07, + "loss": 0.68210948, + "num_input_tokens_seen": 145684740, + "step": 6750, + "time_per_iteration": 3.741011619567871 + }, + { + "auxiliary_loss_clip": 0.01116412, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_clip": 1.02391875, + "balance_loss_mlp": 1.00297701, + "epoch": 0.8117597547045031, + "flos": 23185150243200.0, + "grad_norm": 1.9225121553356126, + "language_loss": 0.86437583, + "learning_rate": 3.601831841730121e-07, + "loss": 0.88637173, + "num_input_tokens_seen": 145702660, + "step": 6751, + "time_per_iteration": 3.6508004665374756 + }, + { + "auxiliary_loss_clip": 0.01124839, + "auxiliary_loss_mlp": 0.0108548, + "balance_loss_clip": 1.02460122, + "balance_loss_mlp": 1.00532365, + "epoch": 0.8118799975951422, + "flos": 23040250778880.0, + "grad_norm": 1.53509691080898, + "language_loss": 0.72417682, + "learning_rate": 3.5973735140984916e-07, + "loss": 0.74628001, + "num_input_tokens_seen": 145722830, + "step": 6752, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0108063, + "auxiliary_loss_mlp": 0.00872886, + "balance_loss_clip": 1.02282262, + "balance_loss_mlp": 1.00005221, + "epoch": 0.8120002404857812, + "flos": 24639962889600.0, + "grad_norm": 2.3474770916196532, + "language_loss": 0.79487944, + "learning_rate": 3.5929176747322607e-07, + "loss": 0.81441456, + "num_input_tokens_seen": 145741935, + "step": 6753, + "time_per_iteration": 2.8238179683685303 + }, + { + "auxiliary_loss_clip": 0.01097901, + "auxiliary_loss_mlp": 0.01079104, + "balance_loss_clip": 1.01839709, + "balance_loss_mlp": 1.00013947, + "epoch": 0.8121204833764204, + "flos": 57415742156160.0, + "grad_norm": 0.8088444729933614, + "language_loss": 0.56218249, + "learning_rate": 3.588464324307372e-07, + "loss": 0.58395255, + "num_input_tokens_seen": 145805560, + "step": 6754, + "time_per_iteration": 4.2791383266448975 + }, + { + "auxiliary_loss_clip": 0.01125793, + "auxiliary_loss_mlp": 0.01083961, + "balance_loss_clip": 1.02476728, + "balance_loss_mlp": 1.00380445, + "epoch": 0.8122407262670595, + "flos": 19464589549440.0, + "grad_norm": 3.335401071475578, + "language_loss": 0.74932945, + "learning_rate": 3.584013463499391e-07, + "loss": 0.77142704, + "num_input_tokens_seen": 145824180, + "step": 6755, + "time_per_iteration": 2.66378116607666 + }, + { + "auxiliary_loss_clip": 0.01096765, + "auxiliary_loss_mlp": 0.01078931, + "balance_loss_clip": 1.01807082, + "balance_loss_mlp": 0.9999671, + "epoch": 0.8123609691576985, + "flos": 56425325472000.0, + "grad_norm": 0.7422208007044124, + "language_loss": 0.64462668, + "learning_rate": 3.579565092983521e-07, + "loss": 0.66638368, + "num_input_tokens_seen": 145885300, + "step": 6756, + "time_per_iteration": 4.0997536182403564 + }, + { + "auxiliary_loss_clip": 0.01134658, + "auxiliary_loss_mlp": 0.01084645, + "balance_loss_clip": 1.02539766, + "balance_loss_mlp": 1.0044415, + "epoch": 0.8124812120483377, + "flos": 20631973564800.0, + "grad_norm": 2.0825610698413968, + "language_loss": 0.83835769, + "learning_rate": 3.575119213434565e-07, + "loss": 0.86055076, + "num_input_tokens_seen": 145903815, + "step": 6757, + "time_per_iteration": 2.6371841430664062 + }, + { + "auxiliary_loss_clip": 0.01124931, + "auxiliary_loss_mlp": 0.01083948, + "balance_loss_clip": 1.02523851, + "balance_loss_mlp": 1.00393522, + "epoch": 0.8126014549389767, + "flos": 22492397566080.0, + "grad_norm": 1.764754118368094, + "language_loss": 0.81837833, + "learning_rate": 3.5706758255269765e-07, + "loss": 0.84046715, + "num_input_tokens_seen": 145922270, + "step": 6758, + "time_per_iteration": 2.6946568489074707 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_clip": 1.02533603, + "balance_loss_mlp": 1.00485301, + "epoch": 0.8127216978296158, + "flos": 23287961946240.0, + "grad_norm": 1.5743007704039, + "language_loss": 0.69644892, + "learning_rate": 3.566234929934795e-07, + "loss": 0.71847045, + "num_input_tokens_seen": 145941470, + "step": 6759, + "time_per_iteration": 2.6777608394622803 + }, + { + "auxiliary_loss_clip": 0.01120725, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_clip": 1.02205098, + "balance_loss_mlp": 1.00341821, + "epoch": 0.812841940720255, + "flos": 25154994049920.0, + "grad_norm": 1.3856424700750678, + "language_loss": 0.71865565, + "learning_rate": 3.561796527331706e-07, + "loss": 0.74069726, + "num_input_tokens_seen": 145963145, + "step": 6760, + "time_per_iteration": 2.698610782623291 + }, + { + "auxiliary_loss_clip": 0.01106291, + "auxiliary_loss_mlp": 0.01084832, + "balance_loss_clip": 1.02376986, + "balance_loss_mlp": 1.00472355, + "epoch": 0.812962183610894, + "flos": 26648446752000.0, + "grad_norm": 1.7613942574860362, + "language_loss": 0.77416027, + "learning_rate": 3.5573606183910163e-07, + "loss": 0.79607153, + "num_input_tokens_seen": 145983150, + "step": 6761, + "time_per_iteration": 2.7789769172668457 + }, + { + "auxiliary_loss_clip": 0.01127058, + "auxiliary_loss_mlp": 0.01084843, + "balance_loss_clip": 1.0252471, + "balance_loss_mlp": 1.00454402, + "epoch": 0.8130824265015331, + "flos": 24966965329920.0, + "grad_norm": 1.6929772114374706, + "language_loss": 0.78523266, + "learning_rate": 3.5529272037856493e-07, + "loss": 0.80735171, + "num_input_tokens_seen": 146001365, + "step": 6762, + "time_per_iteration": 2.7312376499176025 + }, + { + "auxiliary_loss_clip": 0.01073525, + "auxiliary_loss_mlp": 0.01078921, + "balance_loss_clip": 1.01802874, + "balance_loss_mlp": 0.99995679, + "epoch": 0.8132026693921722, + "flos": 67622918175360.0, + "grad_norm": 0.7351572592313304, + "language_loss": 0.53816473, + "learning_rate": 3.548496284188149e-07, + "loss": 0.55968916, + "num_input_tokens_seen": 146061570, + "step": 6763, + "time_per_iteration": 3.4892265796661377 + }, + { + "auxiliary_loss_clip": 0.01088216, + "auxiliary_loss_mlp": 0.01083749, + "balance_loss_clip": 1.02191615, + "balance_loss_mlp": 1.00368774, + "epoch": 0.8133229122828113, + "flos": 19495149045120.0, + "grad_norm": 1.755379337578301, + "language_loss": 0.79152286, + "learning_rate": 3.544067860270681e-07, + "loss": 0.81324244, + "num_input_tokens_seen": 146079145, + "step": 6764, + "time_per_iteration": 2.7856340408325195 + }, + { + "auxiliary_loss_clip": 0.01101841, + "auxiliary_loss_mlp": 0.01084311, + "balance_loss_clip": 1.02341878, + "balance_loss_mlp": 1.00410688, + "epoch": 0.8134431551734503, + "flos": 20668135582080.0, + "grad_norm": 1.6809035811565027, + "language_loss": 0.71221799, + "learning_rate": 3.539641932705029e-07, + "loss": 0.73407948, + "num_input_tokens_seen": 146097625, + "step": 6765, + "time_per_iteration": 2.790210485458374 + }, + { + "auxiliary_loss_clip": 0.01135701, + "auxiliary_loss_mlp": 0.01085064, + "balance_loss_clip": 1.02589738, + "balance_loss_mlp": 1.00486016, + "epoch": 0.8135633980640895, + "flos": 21507332008320.0, + "grad_norm": 2.3414296393433265, + "language_loss": 0.77038699, + "learning_rate": 3.53521850216262e-07, + "loss": 0.79259461, + "num_input_tokens_seen": 146117195, + "step": 6766, + "time_per_iteration": 2.6386005878448486 + }, + { + "auxiliary_loss_clip": 0.01135665, + "auxiliary_loss_mlp": 0.01083503, + "balance_loss_clip": 1.02639043, + "balance_loss_mlp": 1.00334668, + "epoch": 0.8136836409547286, + "flos": 20554442058240.0, + "grad_norm": 1.682557303876185, + "language_loss": 0.76882446, + "learning_rate": 3.530797569314461e-07, + "loss": 0.7910161, + "num_input_tokens_seen": 146136220, + "step": 6767, + "time_per_iteration": 2.6318798065185547 + }, + { + "auxiliary_loss_clip": 0.01135485, + "auxiliary_loss_mlp": 0.01084453, + "balance_loss_clip": 1.02633047, + "balance_loss_mlp": 1.00424862, + "epoch": 0.8138038838453676, + "flos": 20299045380480.0, + "grad_norm": 3.7624179298836036, + "language_loss": 0.77969766, + "learning_rate": 3.5263791348312235e-07, + "loss": 0.80189705, + "num_input_tokens_seen": 146155415, + "step": 6768, + "time_per_iteration": 2.6307384967803955 + }, + { + "auxiliary_loss_clip": 0.01109599, + "auxiliary_loss_mlp": 0.01083354, + "balance_loss_clip": 1.02372408, + "balance_loss_mlp": 1.00324512, + "epoch": 0.8139241267360068, + "flos": 29789840551680.0, + "grad_norm": 1.7830042544343605, + "language_loss": 0.70361173, + "learning_rate": 3.521963199383171e-07, + "loss": 0.72554123, + "num_input_tokens_seen": 146178370, + "step": 6769, + "time_per_iteration": 2.787233829498291 + }, + { + "auxiliary_loss_clip": 0.01097364, + "auxiliary_loss_mlp": 0.01084286, + "balance_loss_clip": 1.02260733, + "balance_loss_mlp": 1.00408173, + "epoch": 0.8140443696266458, + "flos": 19713270384000.0, + "grad_norm": 1.900874904783541, + "language_loss": 0.76671171, + "learning_rate": 3.517549763640197e-07, + "loss": 0.78852826, + "num_input_tokens_seen": 146196010, + "step": 6770, + "time_per_iteration": 2.765993356704712 + }, + { + "auxiliary_loss_clip": 0.01125429, + "auxiliary_loss_mlp": 0.00872794, + "balance_loss_clip": 1.02602041, + "balance_loss_mlp": 1.00016356, + "epoch": 0.8141646125172849, + "flos": 27160568910720.0, + "grad_norm": 1.7590556789989236, + "language_loss": 0.71033323, + "learning_rate": 3.513138828271829e-07, + "loss": 0.73031545, + "num_input_tokens_seen": 146215880, + "step": 6771, + "time_per_iteration": 2.746506929397583 + }, + { + "auxiliary_loss_clip": 0.01101515, + "auxiliary_loss_mlp": 0.01084548, + "balance_loss_clip": 1.02029061, + "balance_loss_mlp": 1.00453472, + "epoch": 0.8142848554079241, + "flos": 39673102700160.0, + "grad_norm": 3.9586178757449706, + "language_loss": 0.70091408, + "learning_rate": 3.508730393947179e-07, + "loss": 0.72277474, + "num_input_tokens_seen": 146239135, + "step": 6772, + "time_per_iteration": 2.941924571990967 + }, + { + "auxiliary_loss_clip": 0.01106493, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_clip": 1.02487147, + "balance_loss_mlp": 1.00494134, + "epoch": 0.8144050982985631, + "flos": 22237288197120.0, + "grad_norm": 2.430888535870174, + "language_loss": 0.72334278, + "learning_rate": 3.504324461335024e-07, + "loss": 0.74525964, + "num_input_tokens_seen": 146259245, + "step": 6773, + "time_per_iteration": 2.774852991104126 + }, + { + "auxiliary_loss_clip": 0.01095133, + "auxiliary_loss_mlp": 0.01083959, + "balance_loss_clip": 1.02044439, + "balance_loss_mlp": 1.00385034, + "epoch": 0.8145253411892022, + "flos": 23038239617280.0, + "grad_norm": 1.5587824458066903, + "language_loss": 0.88143325, + "learning_rate": 3.499921031103732e-07, + "loss": 0.90322417, + "num_input_tokens_seen": 146280015, + "step": 6774, + "time_per_iteration": 2.8918726444244385 + }, + { + "auxiliary_loss_clip": 0.01094957, + "auxiliary_loss_mlp": 0.01084274, + "balance_loss_clip": 1.02565253, + "balance_loss_mlp": 1.00407004, + "epoch": 0.8146455840798413, + "flos": 24827668387200.0, + "grad_norm": 1.5477275194273286, + "language_loss": 0.78299594, + "learning_rate": 3.4955201039212987e-07, + "loss": 0.80478823, + "num_input_tokens_seen": 146300935, + "step": 6775, + "time_per_iteration": 2.793750524520874 + }, + { + "auxiliary_loss_clip": 0.01110617, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_clip": 1.02597964, + "balance_loss_mlp": 1.00435686, + "epoch": 0.8147658269704804, + "flos": 19974520978560.0, + "grad_norm": 3.258326720657862, + "language_loss": 0.65155423, + "learning_rate": 3.4911216804553465e-07, + "loss": 0.67350596, + "num_input_tokens_seen": 146319835, + "step": 6776, + "time_per_iteration": 4.586642265319824 + }, + { + "auxiliary_loss_clip": 0.01116535, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_clip": 1.02467418, + "balance_loss_mlp": 1.00352442, + "epoch": 0.8148860698611194, + "flos": 21178031097600.0, + "grad_norm": 2.2704671656266973, + "language_loss": 0.70705855, + "learning_rate": 3.4867257613731017e-07, + "loss": 0.72906166, + "num_input_tokens_seen": 146339030, + "step": 6777, + "time_per_iteration": 2.733383893966675 + }, + { + "auxiliary_loss_clip": 0.01117151, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_clip": 1.02491045, + "balance_loss_mlp": 1.00346863, + "epoch": 0.8150063127517585, + "flos": 19606903234560.0, + "grad_norm": 1.6338688985726137, + "language_loss": 0.85609108, + "learning_rate": 3.4823323473414343e-07, + "loss": 0.87809837, + "num_input_tokens_seen": 146358550, + "step": 6778, + "time_per_iteration": 2.692171335220337 + }, + { + "auxiliary_loss_clip": 0.01086653, + "auxiliary_loss_mlp": 0.01083768, + "balance_loss_clip": 1.02461267, + "balance_loss_mlp": 1.00342059, + "epoch": 0.8151265556423977, + "flos": 22638374438400.0, + "grad_norm": 2.05245902763558, + "language_loss": 0.76060355, + "learning_rate": 3.477941439026812e-07, + "loss": 0.78230774, + "num_input_tokens_seen": 146376770, + "step": 6779, + "time_per_iteration": 3.597317695617676 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01083841, + "balance_loss_clip": 1.02617443, + "balance_loss_mlp": 1.00378013, + "epoch": 0.8152467985330367, + "flos": 17968048277760.0, + "grad_norm": 1.7042197714752954, + "language_loss": 0.7318117, + "learning_rate": 3.473553037095349e-07, + "loss": 0.75365371, + "num_input_tokens_seen": 146395795, + "step": 6780, + "time_per_iteration": 2.663648843765259 + }, + { + "auxiliary_loss_clip": 0.01119507, + "auxiliary_loss_mlp": 0.01084838, + "balance_loss_clip": 1.02662134, + "balance_loss_mlp": 1.00477719, + "epoch": 0.8153670414236758, + "flos": 24969012405120.0, + "grad_norm": 1.879048570120424, + "language_loss": 0.83068162, + "learning_rate": 3.469167142212743e-07, + "loss": 0.85272503, + "num_input_tokens_seen": 146417640, + "step": 6781, + "time_per_iteration": 3.6993162631988525 + }, + { + "auxiliary_loss_clip": 0.01124573, + "auxiliary_loss_mlp": 0.01084096, + "balance_loss_clip": 1.02457952, + "balance_loss_mlp": 1.00393915, + "epoch": 0.8154872843143149, + "flos": 31066069754880.0, + "grad_norm": 2.8011633338602646, + "language_loss": 0.63260758, + "learning_rate": 3.4647837550443337e-07, + "loss": 0.6546942, + "num_input_tokens_seen": 146436205, + "step": 6782, + "time_per_iteration": 2.7197492122650146 + }, + { + "auxiliary_loss_clip": 0.01103887, + "auxiliary_loss_mlp": 0.01083797, + "balance_loss_clip": 1.02183759, + "balance_loss_mlp": 1.00364065, + "epoch": 0.815607527204954, + "flos": 19391654983680.0, + "grad_norm": 1.9988605858558197, + "language_loss": 0.74331844, + "learning_rate": 3.460402876255086e-07, + "loss": 0.76519531, + "num_input_tokens_seen": 146453595, + "step": 6783, + "time_per_iteration": 2.6802713871002197 + }, + { + "auxiliary_loss_clip": 0.01127863, + "auxiliary_loss_mlp": 0.01083909, + "balance_loss_clip": 1.02664053, + "balance_loss_mlp": 1.00384796, + "epoch": 0.815727770095593, + "flos": 26140418743680.0, + "grad_norm": 2.0078076512867358, + "language_loss": 0.71882242, + "learning_rate": 3.456024506509574e-07, + "loss": 0.74094009, + "num_input_tokens_seen": 146474515, + "step": 6784, + "time_per_iteration": 2.713477611541748 + }, + { + "auxiliary_loss_clip": 0.01126114, + "auxiliary_loss_mlp": 0.0087294, + "balance_loss_clip": 1.02599204, + "balance_loss_mlp": 1.00010395, + "epoch": 0.8158480129862322, + "flos": 25337527989120.0, + "grad_norm": 1.5640987660311303, + "language_loss": 0.73930955, + "learning_rate": 3.4516486464719873e-07, + "loss": 0.75930011, + "num_input_tokens_seen": 146493905, + "step": 6785, + "time_per_iteration": 2.7405638694763184 + }, + { + "auxiliary_loss_clip": 0.01099833, + "auxiliary_loss_mlp": 0.01083264, + "balance_loss_clip": 1.02483678, + "balance_loss_mlp": 1.00315499, + "epoch": 0.8159682558768713, + "flos": 34423645559040.0, + "grad_norm": 2.4915156402211966, + "language_loss": 0.62161314, + "learning_rate": 3.4472752968061445e-07, + "loss": 0.64344412, + "num_input_tokens_seen": 146518335, + "step": 6786, + "time_per_iteration": 2.858590841293335 + }, + { + "auxiliary_loss_clip": 0.01124831, + "auxiliary_loss_mlp": 0.01084291, + "balance_loss_clip": 1.02400386, + "balance_loss_mlp": 1.00413489, + "epoch": 0.8160884987675103, + "flos": 18653223185280.0, + "grad_norm": 1.7927172893745753, + "language_loss": 0.73783988, + "learning_rate": 3.442904458175475e-07, + "loss": 0.75993115, + "num_input_tokens_seen": 146535655, + "step": 6787, + "time_per_iteration": 2.657322645187378 + }, + { + "auxiliary_loss_clip": 0.01127198, + "auxiliary_loss_mlp": 0.01083642, + "balance_loss_clip": 1.02551472, + "balance_loss_mlp": 1.00353336, + "epoch": 0.8162087416581495, + "flos": 31430527102080.0, + "grad_norm": 1.4743545952708559, + "language_loss": 0.75958657, + "learning_rate": 3.438536131243044e-07, + "loss": 0.78169495, + "num_input_tokens_seen": 146556815, + "step": 6788, + "time_per_iteration": 2.760249614715576 + }, + { + "auxiliary_loss_clip": 0.01115956, + "auxiliary_loss_mlp": 0.01083252, + "balance_loss_clip": 1.02452278, + "balance_loss_mlp": 1.00299978, + "epoch": 0.8163289845487885, + "flos": 37593910915200.0, + "grad_norm": 2.209774041790352, + "language_loss": 0.61689425, + "learning_rate": 3.434170316671503e-07, + "loss": 0.63888633, + "num_input_tokens_seen": 146581845, + "step": 6789, + "time_per_iteration": 2.8960094451904297 + }, + { + "auxiliary_loss_clip": 0.01106332, + "auxiliary_loss_mlp": 0.01082594, + "balance_loss_clip": 1.02499175, + "balance_loss_mlp": 1.00258029, + "epoch": 0.8164492274394276, + "flos": 13953989554560.0, + "grad_norm": 2.209938493736442, + "language_loss": 0.89440966, + "learning_rate": 3.4298070151231583e-07, + "loss": 0.91629887, + "num_input_tokens_seen": 146597245, + "step": 6790, + "time_per_iteration": 2.771289110183716 + }, + { + "auxiliary_loss_clip": 0.0111704, + "auxiliary_loss_mlp": 0.01083454, + "balance_loss_clip": 1.02456641, + "balance_loss_mlp": 1.00334525, + "epoch": 0.8165694703300668, + "flos": 28986554747520.0, + "grad_norm": 2.1282078665841917, + "language_loss": 0.60235494, + "learning_rate": 3.425446227259916e-07, + "loss": 0.62435997, + "num_input_tokens_seen": 146618210, + "step": 6791, + "time_per_iteration": 2.96622371673584 + }, + { + "auxiliary_loss_clip": 0.01115575, + "auxiliary_loss_mlp": 0.01084676, + "balance_loss_clip": 1.02388632, + "balance_loss_mlp": 1.00452018, + "epoch": 0.8166897132207058, + "flos": 25118365155840.0, + "grad_norm": 2.725052237410598, + "language_loss": 0.82629633, + "learning_rate": 3.421087953743296e-07, + "loss": 0.84829891, + "num_input_tokens_seen": 146637975, + "step": 6792, + "time_per_iteration": 2.723393201828003 + }, + { + "auxiliary_loss_clip": 0.01126708, + "auxiliary_loss_mlp": 0.01084858, + "balance_loss_clip": 1.02535868, + "balance_loss_mlp": 1.00465393, + "epoch": 0.8168099561113449, + "flos": 23148593176320.0, + "grad_norm": 1.975517528144148, + "language_loss": 0.80140233, + "learning_rate": 3.416732195234464e-07, + "loss": 0.82351798, + "num_input_tokens_seen": 146658030, + "step": 6793, + "time_per_iteration": 2.729818344116211 + }, + { + "auxiliary_loss_clip": 0.01126698, + "auxiliary_loss_mlp": 0.01083546, + "balance_loss_clip": 1.0253402, + "balance_loss_mlp": 1.00343716, + "epoch": 0.816930199001984, + "flos": 18407666833920.0, + "grad_norm": 1.45870149979427, + "language_loss": 0.79343271, + "learning_rate": 3.4123789523941613e-07, + "loss": 0.81553519, + "num_input_tokens_seen": 146677855, + "step": 6794, + "time_per_iteration": 2.649740219116211 + }, + { + "auxiliary_loss_clip": 0.01126576, + "auxiliary_loss_mlp": 0.01084175, + "balance_loss_clip": 1.02513289, + "balance_loss_mlp": 1.00406647, + "epoch": 0.8170504418926231, + "flos": 21251324799360.0, + "grad_norm": 1.7100205019405406, + "language_loss": 0.63292813, + "learning_rate": 3.4080282258827884e-07, + "loss": 0.65503556, + "num_input_tokens_seen": 146696230, + "step": 6795, + "time_per_iteration": 2.6923153400421143 + }, + { + "auxiliary_loss_clip": 0.01127052, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_clip": 1.02579999, + "balance_loss_mlp": 1.00404954, + "epoch": 0.8171706847832622, + "flos": 19099234362240.0, + "grad_norm": 1.992361265910157, + "language_loss": 0.72583461, + "learning_rate": 3.403680016360342e-07, + "loss": 0.74794674, + "num_input_tokens_seen": 146714835, + "step": 6796, + "time_per_iteration": 2.670722007751465 + }, + { + "auxiliary_loss_clip": 0.01129035, + "auxiliary_loss_mlp": 0.0108424, + "balance_loss_clip": 1.02776289, + "balance_loss_mlp": 1.00408316, + "epoch": 0.8172909276739013, + "flos": 21470128496640.0, + "grad_norm": 1.518660992688096, + "language_loss": 0.67714065, + "learning_rate": 3.3993343244864403e-07, + "loss": 0.69927335, + "num_input_tokens_seen": 146734425, + "step": 6797, + "time_per_iteration": 2.696748971939087 + }, + { + "auxiliary_loss_clip": 0.01124008, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.02380443, + "balance_loss_mlp": 1.0044601, + "epoch": 0.8174111705645404, + "flos": 27599792417280.0, + "grad_norm": 4.399184113352718, + "language_loss": 0.72647798, + "learning_rate": 3.394991150920323e-07, + "loss": 0.74856377, + "num_input_tokens_seen": 146757545, + "step": 6798, + "time_per_iteration": 2.7525789737701416 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.00873032, + "balance_loss_clip": 1.02377355, + "balance_loss_mlp": 1.00011253, + "epoch": 0.8175314134551794, + "flos": 14064594508800.0, + "grad_norm": 1.851259810141427, + "language_loss": 0.74468672, + "learning_rate": 3.3906504963208396e-07, + "loss": 0.76441187, + "num_input_tokens_seen": 146774240, + "step": 6799, + "time_per_iteration": 2.844212055206299 + }, + { + "auxiliary_loss_clip": 0.01094153, + "auxiliary_loss_mlp": 0.01084385, + "balance_loss_clip": 1.02219081, + "balance_loss_mlp": 1.00422859, + "epoch": 0.8176516563458186, + "flos": 22708076780160.0, + "grad_norm": 2.0203357948175515, + "language_loss": 0.66753066, + "learning_rate": 3.3863123613464774e-07, + "loss": 0.68931609, + "num_input_tokens_seen": 146793140, + "step": 6800, + "time_per_iteration": 3.6607799530029297 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_clip": 1.02359068, + "balance_loss_mlp": 1.00546324, + "epoch": 0.8177718992364577, + "flos": 21945406279680.0, + "grad_norm": 2.0331615641221563, + "language_loss": 0.74908304, + "learning_rate": 3.381976746655317e-07, + "loss": 0.77111226, + "num_input_tokens_seen": 146812895, + "step": 6801, + "time_per_iteration": 3.5669212341308594 + }, + { + "auxiliary_loss_clip": 0.01088065, + "auxiliary_loss_mlp": 0.01083161, + "balance_loss_clip": 1.02165949, + "balance_loss_mlp": 1.00309992, + "epoch": 0.8178921421270967, + "flos": 22017443005440.0, + "grad_norm": 1.9491460915067755, + "language_loss": 0.67304826, + "learning_rate": 3.3776436529050756e-07, + "loss": 0.69476044, + "num_input_tokens_seen": 146832445, + "step": 6802, + "time_per_iteration": 2.675492286682129 + }, + { + "auxiliary_loss_clip": 0.01134939, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_clip": 1.02577114, + "balance_loss_mlp": 1.00481033, + "epoch": 0.8180123850177359, + "flos": 33183111496320.0, + "grad_norm": 1.9302680111224328, + "language_loss": 0.72411585, + "learning_rate": 3.373313080753073e-07, + "loss": 0.74631488, + "num_input_tokens_seen": 146856505, + "step": 6803, + "time_per_iteration": 2.6466853618621826 + }, + { + "auxiliary_loss_clip": 0.01127247, + "auxiliary_loss_mlp": 0.01083459, + "balance_loss_clip": 1.02570772, + "balance_loss_mlp": 1.00330293, + "epoch": 0.8181326279083749, + "flos": 22091167670400.0, + "grad_norm": 1.5256542473327332, + "language_loss": 0.77616239, + "learning_rate": 3.3689850308562527e-07, + "loss": 0.79826951, + "num_input_tokens_seen": 146876950, + "step": 6804, + "time_per_iteration": 2.5299534797668457 + }, + { + "auxiliary_loss_clip": 0.01094019, + "auxiliary_loss_mlp": 0.01083594, + "balance_loss_clip": 1.02146614, + "balance_loss_mlp": 1.00358105, + "epoch": 0.818252870799014, + "flos": 15705747936000.0, + "grad_norm": 2.10776756492308, + "language_loss": 0.77613628, + "learning_rate": 3.364659503871183e-07, + "loss": 0.79791236, + "num_input_tokens_seen": 146894885, + "step": 6805, + "time_per_iteration": 3.499674081802368 + }, + { + "auxiliary_loss_clip": 0.01107243, + "auxiliary_loss_mlp": 0.01084986, + "balance_loss_clip": 1.02280402, + "balance_loss_mlp": 1.00487697, + "epoch": 0.8183731136896532, + "flos": 18770687637120.0, + "grad_norm": 1.8573556351478147, + "language_loss": 0.83565128, + "learning_rate": 3.3603365004540417e-07, + "loss": 0.85757351, + "num_input_tokens_seen": 146913180, + "step": 6806, + "time_per_iteration": 2.69936203956604 + }, + { + "auxiliary_loss_clip": 0.01136789, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_clip": 1.02744722, + "balance_loss_mlp": 1.00407648, + "epoch": 0.8184933565802922, + "flos": 26541792293760.0, + "grad_norm": 1.788879136895299, + "language_loss": 0.77123785, + "learning_rate": 3.356016021260624e-07, + "loss": 0.79344755, + "num_input_tokens_seen": 146933510, + "step": 6807, + "time_per_iteration": 3.5681869983673096 + }, + { + "auxiliary_loss_clip": 0.01124931, + "auxiliary_loss_mlp": 0.01084813, + "balance_loss_clip": 1.02505481, + "balance_loss_mlp": 1.00460947, + "epoch": 0.8186135994709313, + "flos": 17530117660800.0, + "grad_norm": 8.413779297422174, + "language_loss": 0.66275185, + "learning_rate": 3.35169806694634e-07, + "loss": 0.68484926, + "num_input_tokens_seen": 146951760, + "step": 6808, + "time_per_iteration": 2.6605348587036133 + }, + { + "auxiliary_loss_clip": 0.01084989, + "auxiliary_loss_mlp": 0.01079054, + "balance_loss_clip": 1.02290678, + "balance_loss_mlp": 1.00008965, + "epoch": 0.8187338423615703, + "flos": 63480300675840.0, + "grad_norm": 0.7101502259005457, + "language_loss": 0.60671788, + "learning_rate": 3.3473826381662186e-07, + "loss": 0.6283583, + "num_input_tokens_seen": 147022900, + "step": 6809, + "time_per_iteration": 3.4092094898223877 + }, + { + "auxiliary_loss_clip": 0.01119799, + "auxiliary_loss_mlp": 0.01083373, + "balance_loss_clip": 1.02559066, + "balance_loss_mlp": 1.00331235, + "epoch": 0.8188540852522095, + "flos": 17529974006400.0, + "grad_norm": 2.6348033053004953, + "language_loss": 0.81788749, + "learning_rate": 3.3430697355749216e-07, + "loss": 0.83991921, + "num_input_tokens_seen": 147040590, + "step": 6810, + "time_per_iteration": 2.630504846572876 + }, + { + "auxiliary_loss_clip": 0.01098594, + "auxiliary_loss_mlp": 0.01084085, + "balance_loss_clip": 1.02429688, + "balance_loss_mlp": 1.00397658, + "epoch": 0.8189743281428485, + "flos": 14392530702720.0, + "grad_norm": 2.001372222413993, + "language_loss": 0.75268549, + "learning_rate": 3.3387593598266907e-07, + "loss": 0.77451229, + "num_input_tokens_seen": 147057200, + "step": 6811, + "time_per_iteration": 2.7708632946014404 + }, + { + "auxiliary_loss_clip": 0.01109592, + "auxiliary_loss_mlp": 0.01083388, + "balance_loss_clip": 1.02454984, + "balance_loss_mlp": 1.00318384, + "epoch": 0.8190945710334876, + "flos": 25080479285760.0, + "grad_norm": 2.812962551499838, + "language_loss": 0.78149617, + "learning_rate": 3.3344515115754225e-07, + "loss": 0.80342603, + "num_input_tokens_seen": 147076180, + "step": 6812, + "time_per_iteration": 2.846414566040039 + }, + { + "auxiliary_loss_clip": 0.01092959, + "auxiliary_loss_mlp": 0.01085025, + "balance_loss_clip": 1.02487993, + "balance_loss_mlp": 1.00491643, + "epoch": 0.8192148139241268, + "flos": 21507152440320.0, + "grad_norm": 2.4792523422206134, + "language_loss": 0.79976654, + "learning_rate": 3.33014619147461e-07, + "loss": 0.82154644, + "num_input_tokens_seen": 147094205, + "step": 6813, + "time_per_iteration": 2.7494170665740967 + }, + { + "auxiliary_loss_clip": 0.01115243, + "auxiliary_loss_mlp": 0.0108457, + "balance_loss_clip": 1.02447844, + "balance_loss_mlp": 1.0043664, + "epoch": 0.8193350568147658, + "flos": 23952166289280.0, + "grad_norm": 1.8907117330250762, + "language_loss": 0.71672839, + "learning_rate": 3.325843400177362e-07, + "loss": 0.7387265, + "num_input_tokens_seen": 147115545, + "step": 6814, + "time_per_iteration": 2.7087578773498535 + }, + { + "auxiliary_loss_clip": 0.01128714, + "auxiliary_loss_mlp": 0.00872897, + "balance_loss_clip": 1.02694368, + "balance_loss_mlp": 1.00007904, + "epoch": 0.8194552997054049, + "flos": 20559469962240.0, + "grad_norm": 1.840127972281474, + "language_loss": 0.73702091, + "learning_rate": 3.32154313833642e-07, + "loss": 0.75703704, + "num_input_tokens_seen": 147135700, + "step": 6815, + "time_per_iteration": 2.6774885654449463 + }, + { + "auxiliary_loss_clip": 0.01135948, + "auxiliary_loss_mlp": 0.01083879, + "balance_loss_clip": 1.02664733, + "balance_loss_mlp": 1.00372255, + "epoch": 0.819575542596044, + "flos": 26031753123840.0, + "grad_norm": 2.1332860647490515, + "language_loss": 0.59564131, + "learning_rate": 3.3172454066041164e-07, + "loss": 0.61783957, + "num_input_tokens_seen": 147155205, + "step": 6816, + "time_per_iteration": 2.6826601028442383 + }, + { + "auxiliary_loss_clip": 0.01081812, + "auxiliary_loss_mlp": 0.0087281, + "balance_loss_clip": 1.01896131, + "balance_loss_mlp": 1.00014114, + "epoch": 0.8196957854866831, + "flos": 29096944220160.0, + "grad_norm": 1.7537750516955843, + "language_loss": 0.76339662, + "learning_rate": 3.3129502056324234e-07, + "loss": 0.78294283, + "num_input_tokens_seen": 147176570, + "step": 6817, + "time_per_iteration": 2.962571144104004 + }, + { + "auxiliary_loss_clip": 0.01060786, + "auxiliary_loss_mlp": 0.01078992, + "balance_loss_clip": 1.01365113, + "balance_loss_mlp": 1.00002789, + "epoch": 0.8198160283773221, + "flos": 69033631898880.0, + "grad_norm": 0.796000718511678, + "language_loss": 0.59771895, + "learning_rate": 3.3086575360729165e-07, + "loss": 0.61911672, + "num_input_tokens_seen": 147234105, + "step": 6818, + "time_per_iteration": 3.3780386447906494 + }, + { + "auxiliary_loss_clip": 0.01117479, + "auxiliary_loss_mlp": 0.01084408, + "balance_loss_clip": 1.02520549, + "balance_loss_mlp": 1.0042038, + "epoch": 0.8199362712679613, + "flos": 16618058496000.0, + "grad_norm": 1.6451669886198084, + "language_loss": 0.71328485, + "learning_rate": 3.3043673985767906e-07, + "loss": 0.73530376, + "num_input_tokens_seen": 147253170, + "step": 6819, + "time_per_iteration": 2.879244327545166 + }, + { + "auxiliary_loss_clip": 0.01108427, + "auxiliary_loss_mlp": 0.0108493, + "balance_loss_clip": 1.02422309, + "balance_loss_mlp": 1.00467801, + "epoch": 0.8200565141586004, + "flos": 21757664868480.0, + "grad_norm": 1.6652958427204028, + "language_loss": 0.77887845, + "learning_rate": 3.3000797937948564e-07, + "loss": 0.80081201, + "num_input_tokens_seen": 147271465, + "step": 6820, + "time_per_iteration": 2.710754871368408 + }, + { + "auxiliary_loss_clip": 0.01070407, + "auxiliary_loss_mlp": 0.01078852, + "balance_loss_clip": 1.01687098, + "balance_loss_mlp": 0.99988824, + "epoch": 0.8201767570492394, + "flos": 69807112392960.0, + "grad_norm": 0.941195538958667, + "language_loss": 0.65073919, + "learning_rate": 3.295794722377534e-07, + "loss": 0.67223173, + "num_input_tokens_seen": 147335070, + "step": 6821, + "time_per_iteration": 3.35029673576355 + }, + { + "auxiliary_loss_clip": 0.01134045, + "auxiliary_loss_mlp": 0.01084027, + "balance_loss_clip": 1.02497745, + "balance_loss_mlp": 1.00391817, + "epoch": 0.8202969999398786, + "flos": 23111892455040.0, + "grad_norm": 1.4300479913915607, + "language_loss": 0.79776323, + "learning_rate": 3.291512184974876e-07, + "loss": 0.8199439, + "num_input_tokens_seen": 147355460, + "step": 6822, + "time_per_iteration": 2.6624503135681152 + }, + { + "auxiliary_loss_clip": 0.01118862, + "auxiliary_loss_mlp": 0.01084725, + "balance_loss_clip": 1.02552509, + "balance_loss_mlp": 1.00452125, + "epoch": 0.8204172428305176, + "flos": 28220616109440.0, + "grad_norm": 1.6699664811494397, + "language_loss": 0.66283739, + "learning_rate": 3.2872321822365346e-07, + "loss": 0.68487322, + "num_input_tokens_seen": 147375675, + "step": 6823, + "time_per_iteration": 2.7203264236450195 + }, + { + "auxiliary_loss_clip": 0.0112524, + "auxiliary_loss_mlp": 0.01083613, + "balance_loss_clip": 1.02535367, + "balance_loss_mlp": 1.00355244, + "epoch": 0.8205374857211567, + "flos": 20887011106560.0, + "grad_norm": 1.6879120741713114, + "language_loss": 0.73218429, + "learning_rate": 3.282954714811783e-07, + "loss": 0.75427282, + "num_input_tokens_seen": 147394580, + "step": 6824, + "time_per_iteration": 2.8044114112854004 + }, + { + "auxiliary_loss_clip": 0.01117889, + "auxiliary_loss_mlp": 0.01084117, + "balance_loss_clip": 1.02434051, + "balance_loss_mlp": 1.00396073, + "epoch": 0.8206577286117959, + "flos": 13152140294400.0, + "grad_norm": 2.0034988046941913, + "language_loss": 0.71058846, + "learning_rate": 3.2786797833495093e-07, + "loss": 0.7326085, + "num_input_tokens_seen": 147409935, + "step": 6825, + "time_per_iteration": 3.523137092590332 + }, + { + "auxiliary_loss_clip": 0.01134583, + "auxiliary_loss_mlp": 0.01083165, + "balance_loss_clip": 1.02544212, + "balance_loss_mlp": 1.00310373, + "epoch": 0.8207779715024349, + "flos": 25265634917760.0, + "grad_norm": 1.7700247489858734, + "language_loss": 0.7249012, + "learning_rate": 3.274407388498213e-07, + "loss": 0.74707878, + "num_input_tokens_seen": 147428065, + "step": 6826, + "time_per_iteration": 3.552582263946533 + }, + { + "auxiliary_loss_clip": 0.01107574, + "auxiliary_loss_mlp": 0.01084553, + "balance_loss_clip": 1.0236553, + "balance_loss_mlp": 1.00444436, + "epoch": 0.820898214393074, + "flos": 19610243199360.0, + "grad_norm": 3.5421842692817327, + "language_loss": 0.73919582, + "learning_rate": 3.270137530906021e-07, + "loss": 0.7611171, + "num_input_tokens_seen": 147447300, + "step": 6827, + "time_per_iteration": 2.7792091369628906 + }, + { + "auxiliary_loss_clip": 0.0108917, + "auxiliary_loss_mlp": 0.01083147, + "balance_loss_clip": 1.01878333, + "balance_loss_mlp": 1.00318193, + "epoch": 0.8210184572837131, + "flos": 15596615439360.0, + "grad_norm": 1.6808508711912573, + "language_loss": 0.83512247, + "learning_rate": 3.265870211220665e-07, + "loss": 0.85684568, + "num_input_tokens_seen": 147465135, + "step": 6828, + "time_per_iteration": 2.808530807495117 + }, + { + "auxiliary_loss_clip": 0.01101242, + "auxiliary_loss_mlp": 0.01085717, + "balance_loss_clip": 1.01963377, + "balance_loss_mlp": 1.00541735, + "epoch": 0.8211387001743522, + "flos": 20813932886400.0, + "grad_norm": 2.750040022397246, + "language_loss": 0.81657755, + "learning_rate": 3.2616054300894934e-07, + "loss": 0.83844709, + "num_input_tokens_seen": 147484585, + "step": 6829, + "time_per_iteration": 2.797598361968994 + }, + { + "auxiliary_loss_clip": 0.01111219, + "auxiliary_loss_mlp": 0.01084513, + "balance_loss_clip": 1.02106953, + "balance_loss_mlp": 1.00430918, + "epoch": 0.8212589430649913, + "flos": 27704579368320.0, + "grad_norm": 1.9684786148743425, + "language_loss": 0.8451581, + "learning_rate": 3.2573431881594693e-07, + "loss": 0.86711538, + "num_input_tokens_seen": 147504130, + "step": 6830, + "time_per_iteration": 3.8324294090270996 + }, + { + "auxiliary_loss_clip": 0.01086265, + "auxiliary_loss_mlp": 0.01084401, + "balance_loss_clip": 1.02001512, + "balance_loss_mlp": 1.00419688, + "epoch": 0.8213791859556304, + "flos": 22455625017600.0, + "grad_norm": 2.1138493421045768, + "language_loss": 0.65805829, + "learning_rate": 3.2530834860771663e-07, + "loss": 0.67976499, + "num_input_tokens_seen": 147523510, + "step": 6831, + "time_per_iteration": 2.8569529056549072 + }, + { + "auxiliary_loss_clip": 0.01127686, + "auxiliary_loss_mlp": 0.01082929, + "balance_loss_clip": 1.02634335, + "balance_loss_mlp": 1.00286806, + "epoch": 0.8214994288462695, + "flos": 16654471908480.0, + "grad_norm": 2.1330403142065313, + "language_loss": 0.74254429, + "learning_rate": 3.248826324488794e-07, + "loss": 0.76465046, + "num_input_tokens_seen": 147540805, + "step": 6832, + "time_per_iteration": 2.6738409996032715 + }, + { + "auxiliary_loss_clip": 0.01137185, + "auxiliary_loss_mlp": 0.01084015, + "balance_loss_clip": 1.02790439, + "balance_loss_mlp": 1.00400162, + "epoch": 0.8216196717369085, + "flos": 25221787390080.0, + "grad_norm": 2.5627161756481165, + "language_loss": 0.87793887, + "learning_rate": 3.244571704040138e-07, + "loss": 0.90015084, + "num_input_tokens_seen": 147560965, + "step": 6833, + "time_per_iteration": 3.566624641418457 + }, + { + "auxiliary_loss_clip": 0.01127227, + "auxiliary_loss_mlp": 0.01082925, + "balance_loss_clip": 1.02537215, + "balance_loss_mlp": 1.00272119, + "epoch": 0.8217399146275477, + "flos": 25371930240000.0, + "grad_norm": 1.862438305511734, + "language_loss": 0.73663139, + "learning_rate": 3.2403196253766374e-07, + "loss": 0.75873291, + "num_input_tokens_seen": 147580045, + "step": 6834, + "time_per_iteration": 2.7926197052001953 + }, + { + "auxiliary_loss_clip": 0.01128382, + "auxiliary_loss_mlp": 0.01085019, + "balance_loss_clip": 1.02611542, + "balance_loss_mlp": 1.00471985, + "epoch": 0.8218601575181868, + "flos": 25629625388160.0, + "grad_norm": 2.474650681079989, + "language_loss": 0.79041588, + "learning_rate": 3.2360700891433254e-07, + "loss": 0.81254995, + "num_input_tokens_seen": 147599070, + "step": 6835, + "time_per_iteration": 2.700420618057251 + }, + { + "auxiliary_loss_clip": 0.01081335, + "auxiliary_loss_mlp": 0.01079078, + "balance_loss_clip": 1.01916027, + "balance_loss_mlp": 1.00011384, + "epoch": 0.8219804004088258, + "flos": 67660229427840.0, + "grad_norm": 0.7894625485147461, + "language_loss": 0.57297397, + "learning_rate": 3.231823095984847e-07, + "loss": 0.59457815, + "num_input_tokens_seen": 147653710, + "step": 6836, + "time_per_iteration": 3.28006911277771 + }, + { + "auxiliary_loss_clip": 0.01115466, + "auxiliary_loss_mlp": 0.01083956, + "balance_loss_clip": 1.02383518, + "balance_loss_mlp": 1.00389504, + "epoch": 0.822100643299465, + "flos": 19464266327040.0, + "grad_norm": 1.8877282215136044, + "language_loss": 0.76259017, + "learning_rate": 3.2275786465454814e-07, + "loss": 0.78458446, + "num_input_tokens_seen": 147670360, + "step": 6837, + "time_per_iteration": 2.703401803970337 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_clip": 1.02491784, + "balance_loss_mlp": 1.00356984, + "epoch": 0.822220886190104, + "flos": 24681368292480.0, + "grad_norm": 1.6816082360356248, + "language_loss": 0.75585735, + "learning_rate": 3.2233367414690917e-07, + "loss": 0.77778316, + "num_input_tokens_seen": 147692550, + "step": 6838, + "time_per_iteration": 2.7713980674743652 + }, + { + "auxiliary_loss_clip": 0.01107871, + "auxiliary_loss_mlp": 0.01084471, + "balance_loss_clip": 1.024436, + "balance_loss_mlp": 1.00440967, + "epoch": 0.8223411290807431, + "flos": 27819062991360.0, + "grad_norm": 5.688487391291991, + "language_loss": 0.85134888, + "learning_rate": 3.219097381399183e-07, + "loss": 0.8732723, + "num_input_tokens_seen": 147709725, + "step": 6839, + "time_per_iteration": 2.8195810317993164 + }, + { + "auxiliary_loss_clip": 0.01117064, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_clip": 1.02424896, + "balance_loss_mlp": 1.00399852, + "epoch": 0.8224613719713821, + "flos": 23218546913280.0, + "grad_norm": 1.876264700484011, + "language_loss": 0.81137884, + "learning_rate": 3.2148605669788584e-07, + "loss": 0.83339059, + "num_input_tokens_seen": 147729615, + "step": 6840, + "time_per_iteration": 2.6827239990234375 + }, + { + "auxiliary_loss_clip": 0.01116759, + "auxiliary_loss_mlp": 0.01083262, + "balance_loss_clip": 1.02523685, + "balance_loss_mlp": 1.00315392, + "epoch": 0.8225816148620213, + "flos": 15706250726400.0, + "grad_norm": 3.834345188498575, + "language_loss": 0.77428859, + "learning_rate": 3.2106262988508405e-07, + "loss": 0.79628873, + "num_input_tokens_seen": 147747665, + "step": 6841, + "time_per_iteration": 2.689103364944458 + }, + { + "auxiliary_loss_clip": 0.01114173, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_clip": 1.02255058, + "balance_loss_mlp": 1.00393367, + "epoch": 0.8227018577526604, + "flos": 18515111391360.0, + "grad_norm": 2.033905101376149, + "language_loss": 0.74036241, + "learning_rate": 3.206394577657465e-07, + "loss": 0.76234412, + "num_input_tokens_seen": 147765445, + "step": 6842, + "time_per_iteration": 2.695988178253174 + }, + { + "auxiliary_loss_clip": 0.01125344, + "auxiliary_loss_mlp": 0.0108408, + "balance_loss_clip": 1.02498865, + "balance_loss_mlp": 1.00382817, + "epoch": 0.8228221006432994, + "flos": 22236785406720.0, + "grad_norm": 2.4855786436745477, + "language_loss": 0.72571516, + "learning_rate": 3.202165404040675e-07, + "loss": 0.74780941, + "num_input_tokens_seen": 147783365, + "step": 6843, + "time_per_iteration": 2.6630594730377197 + }, + { + "auxiliary_loss_clip": 0.01086763, + "auxiliary_loss_mlp": 0.01085173, + "balance_loss_clip": 1.02083445, + "balance_loss_mlp": 1.00496888, + "epoch": 0.8229423435339386, + "flos": 24097532630400.0, + "grad_norm": 1.9946676677905644, + "language_loss": 0.7477994, + "learning_rate": 3.1979387786420396e-07, + "loss": 0.76951873, + "num_input_tokens_seen": 147803605, + "step": 6844, + "time_per_iteration": 2.8922619819641113 + }, + { + "auxiliary_loss_clip": 0.01117975, + "auxiliary_loss_mlp": 0.01083938, + "balance_loss_clip": 1.02485061, + "balance_loss_mlp": 1.0038296, + "epoch": 0.8230625864245776, + "flos": 23878549365120.0, + "grad_norm": 1.8736541390645454, + "language_loss": 0.82429099, + "learning_rate": 3.1937147021027346e-07, + "loss": 0.84631014, + "num_input_tokens_seen": 147822060, + "step": 6845, + "time_per_iteration": 2.7202343940734863 + }, + { + "auxiliary_loss_clip": 0.01125646, + "auxiliary_loss_mlp": 0.01084446, + "balance_loss_clip": 1.02547216, + "balance_loss_mlp": 1.00438464, + "epoch": 0.8231828293152167, + "flos": 16581106379520.0, + "grad_norm": 2.7668089109797847, + "language_loss": 0.76758677, + "learning_rate": 3.189493175063547e-07, + "loss": 0.78968769, + "num_input_tokens_seen": 147839295, + "step": 6846, + "time_per_iteration": 2.664912462234497 + }, + { + "auxiliary_loss_clip": 0.01116881, + "auxiliary_loss_mlp": 0.01084025, + "balance_loss_clip": 1.02560341, + "balance_loss_mlp": 1.00382078, + "epoch": 0.8233030722058559, + "flos": 18880071528960.0, + "grad_norm": 1.747829361622396, + "language_loss": 0.669191, + "learning_rate": 3.1852741981648776e-07, + "loss": 0.69120008, + "num_input_tokens_seen": 147857945, + "step": 6847, + "time_per_iteration": 2.7147328853607178 + }, + { + "auxiliary_loss_clip": 0.01100908, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_clip": 1.02035451, + "balance_loss_mlp": 1.00412369, + "epoch": 0.8234233150964949, + "flos": 28439024757120.0, + "grad_norm": 2.025614375397343, + "language_loss": 0.69865286, + "learning_rate": 3.1810577720467404e-07, + "loss": 0.72050375, + "num_input_tokens_seen": 147879675, + "step": 6848, + "time_per_iteration": 2.824591875076294 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_clip": 1.02502084, + "balance_loss_mlp": 1.00428438, + "epoch": 0.823543557987134, + "flos": 33765941577600.0, + "grad_norm": 1.4367246189053087, + "language_loss": 0.56206656, + "learning_rate": 3.176843897348769e-07, + "loss": 0.58408272, + "num_input_tokens_seen": 147902870, + "step": 6849, + "time_per_iteration": 2.7884624004364014 + }, + { + "auxiliary_loss_clip": 0.01117115, + "auxiliary_loss_mlp": 0.01084548, + "balance_loss_clip": 1.02513158, + "balance_loss_mlp": 1.00439191, + "epoch": 0.8236638008777731, + "flos": 17092366611840.0, + "grad_norm": 2.9236869644198387, + "language_loss": 0.75410903, + "learning_rate": 3.1726325747102034e-07, + "loss": 0.77612567, + "num_input_tokens_seen": 147921245, + "step": 6850, + "time_per_iteration": 2.776341676712036 + }, + { + "auxiliary_loss_clip": 0.01099516, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_clip": 1.02304256, + "balance_loss_mlp": 1.00306356, + "epoch": 0.8237840437684122, + "flos": 61639982334720.0, + "grad_norm": 1.4634571579498823, + "language_loss": 0.64110857, + "learning_rate": 3.1684238047698974e-07, + "loss": 0.66293597, + "num_input_tokens_seen": 147949515, + "step": 6851, + "time_per_iteration": 4.065609455108643 + }, + { + "auxiliary_loss_clip": 0.01114012, + "auxiliary_loss_mlp": 0.01084635, + "balance_loss_clip": 1.02256727, + "balance_loss_mlp": 1.00443101, + "epoch": 0.8239042866590512, + "flos": 27309023821440.0, + "grad_norm": 4.028269691991051, + "language_loss": 0.53351867, + "learning_rate": 3.1642175881663155e-07, + "loss": 0.55550516, + "num_input_tokens_seen": 147969245, + "step": 6852, + "time_per_iteration": 3.721287250518799 + }, + { + "auxiliary_loss_clip": 0.01134893, + "auxiliary_loss_mlp": 0.01083799, + "balance_loss_clip": 1.02533889, + "balance_loss_mlp": 1.00364292, + "epoch": 0.8240245295496904, + "flos": 21726351187200.0, + "grad_norm": 1.9868456479393843, + "language_loss": 0.83809197, + "learning_rate": 3.160013925537537e-07, + "loss": 0.86027884, + "num_input_tokens_seen": 147990080, + "step": 6853, + "time_per_iteration": 2.6595120429992676 + }, + { + "auxiliary_loss_clip": 0.01090322, + "auxiliary_loss_mlp": 0.01084464, + "balance_loss_clip": 1.02395189, + "balance_loss_mlp": 1.00435495, + "epoch": 0.8241447724403295, + "flos": 20009318279040.0, + "grad_norm": 1.7885672865142064, + "language_loss": 0.75747567, + "learning_rate": 3.155812817521266e-07, + "loss": 0.77922356, + "num_input_tokens_seen": 148010455, + "step": 6854, + "time_per_iteration": 2.7769055366516113 + }, + { + "auxiliary_loss_clip": 0.0111618, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.02435422, + "balance_loss_mlp": 1.00411642, + "epoch": 0.8242650153309685, + "flos": 22272983337600.0, + "grad_norm": 2.207876304331153, + "language_loss": 0.78077269, + "learning_rate": 3.151614264754787e-07, + "loss": 0.80277717, + "num_input_tokens_seen": 148028400, + "step": 6855, + "time_per_iteration": 3.7021491527557373 + }, + { + "auxiliary_loss_clip": 0.01134857, + "auxiliary_loss_mlp": 0.01085151, + "balance_loss_clip": 1.02490425, + "balance_loss_mlp": 1.00489914, + "epoch": 0.8243852582216077, + "flos": 22309971367680.0, + "grad_norm": 1.9706247696285242, + "language_loss": 0.78915071, + "learning_rate": 3.147418267875035e-07, + "loss": 0.81135076, + "num_input_tokens_seen": 148046530, + "step": 6856, + "time_per_iteration": 2.6513683795928955 + }, + { + "auxiliary_loss_clip": 0.01088349, + "auxiliary_loss_mlp": 0.00872841, + "balance_loss_clip": 1.02139735, + "balance_loss_mlp": 1.00012922, + "epoch": 0.8245055011122467, + "flos": 24645421756800.0, + "grad_norm": 2.015600509616438, + "language_loss": 0.65346426, + "learning_rate": 3.1432248275185315e-07, + "loss": 0.67307615, + "num_input_tokens_seen": 148067040, + "step": 6857, + "time_per_iteration": 2.8998067378997803 + }, + { + "auxiliary_loss_clip": 0.01126318, + "auxiliary_loss_mlp": 0.01084313, + "balance_loss_clip": 1.02576125, + "balance_loss_mlp": 1.00415683, + "epoch": 0.8246257440028858, + "flos": 17487275713920.0, + "grad_norm": 1.9567359102182245, + "language_loss": 0.77127683, + "learning_rate": 3.139033944321412e-07, + "loss": 0.79338318, + "num_input_tokens_seen": 148084400, + "step": 6858, + "time_per_iteration": 3.5694048404693604 + }, + { + "auxiliary_loss_clip": 0.01125035, + "auxiliary_loss_mlp": 0.01084499, + "balance_loss_clip": 1.0239507, + "balance_loss_mlp": 1.00434279, + "epoch": 0.824745986893525, + "flos": 25010130499200.0, + "grad_norm": 3.289048670155211, + "language_loss": 0.7895236, + "learning_rate": 3.1348456189194507e-07, + "loss": 0.81161898, + "num_input_tokens_seen": 148104860, + "step": 6859, + "time_per_iteration": 2.7646453380584717 + }, + { + "auxiliary_loss_clip": 0.01109339, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_clip": 1.02419257, + "balance_loss_mlp": 1.0039916, + "epoch": 0.824866229784164, + "flos": 18772698798720.0, + "grad_norm": 1.7661532619593932, + "language_loss": 0.83017409, + "learning_rate": 3.1306598519479876e-07, + "loss": 0.852108, + "num_input_tokens_seen": 148124680, + "step": 6860, + "time_per_iteration": 2.835845708847046 + }, + { + "auxiliary_loss_clip": 0.01110827, + "auxiliary_loss_mlp": 0.01083432, + "balance_loss_clip": 1.02505934, + "balance_loss_mlp": 1.00337124, + "epoch": 0.8249864726748031, + "flos": 23842171866240.0, + "grad_norm": 2.0410270020177474, + "language_loss": 0.78164005, + "learning_rate": 3.1264766440420177e-07, + "loss": 0.80358261, + "num_input_tokens_seen": 148147150, + "step": 6861, + "time_per_iteration": 2.813650369644165 + }, + { + "auxiliary_loss_clip": 0.01123532, + "auxiliary_loss_mlp": 0.01083503, + "balance_loss_clip": 1.02405572, + "balance_loss_mlp": 1.00339413, + "epoch": 0.8251067155654422, + "flos": 20303103617280.0, + "grad_norm": 1.8270429541962325, + "language_loss": 0.68992245, + "learning_rate": 3.122295995836124e-07, + "loss": 0.71199274, + "num_input_tokens_seen": 148167020, + "step": 6862, + "time_per_iteration": 2.6747262477874756 + }, + { + "auxiliary_loss_clip": 0.01125595, + "auxiliary_loss_mlp": 0.01084862, + "balance_loss_clip": 1.02419078, + "balance_loss_mlp": 1.00465786, + "epoch": 0.8252269584560813, + "flos": 25009699536000.0, + "grad_norm": 2.018049435550214, + "language_loss": 0.77217734, + "learning_rate": 3.118117907964508e-07, + "loss": 0.79428196, + "num_input_tokens_seen": 148188965, + "step": 6863, + "time_per_iteration": 2.772463798522949 + }, + { + "auxiliary_loss_clip": 0.01108999, + "auxiliary_loss_mlp": 0.01083685, + "balance_loss_clip": 1.02453709, + "balance_loss_mlp": 1.00367188, + "epoch": 0.8253472013467203, + "flos": 17128564542720.0, + "grad_norm": 1.8836932862466556, + "language_loss": 0.80068374, + "learning_rate": 3.1139423810609856e-07, + "loss": 0.8226105, + "num_input_tokens_seen": 148205660, + "step": 6864, + "time_per_iteration": 2.7375600337982178 + }, + { + "auxiliary_loss_clip": 0.01134983, + "auxiliary_loss_mlp": 0.01084164, + "balance_loss_clip": 1.02532017, + "balance_loss_mlp": 1.00396025, + "epoch": 0.8254674442373595, + "flos": 22414794232320.0, + "grad_norm": 1.8031515091976158, + "language_loss": 0.75300503, + "learning_rate": 3.1097694157589714e-07, + "loss": 0.77519649, + "num_input_tokens_seen": 148225545, + "step": 6865, + "time_per_iteration": 2.6705527305603027 + }, + { + "auxiliary_loss_clip": 0.0112558, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_clip": 1.02630711, + "balance_loss_mlp": 1.00404906, + "epoch": 0.8255876871279986, + "flos": 24786765774720.0, + "grad_norm": 4.957341140501696, + "language_loss": 0.76727033, + "learning_rate": 3.105599012691511e-07, + "loss": 0.78936774, + "num_input_tokens_seen": 148243975, + "step": 6866, + "time_per_iteration": 2.6346569061279297 + }, + { + "auxiliary_loss_clip": 0.0112364, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_clip": 1.02411687, + "balance_loss_mlp": 1.00396156, + "epoch": 0.8257079300186376, + "flos": 27455431656960.0, + "grad_norm": 1.4374567535035314, + "language_loss": 0.82404172, + "learning_rate": 3.101431172491249e-07, + "loss": 0.84611881, + "num_input_tokens_seen": 148265520, + "step": 6867, + "time_per_iteration": 2.7588963508605957 + }, + { + "auxiliary_loss_clip": 0.01107308, + "auxiliary_loss_mlp": 0.00872961, + "balance_loss_clip": 1.023085, + "balance_loss_mlp": 1.00006008, + "epoch": 0.8258281729092768, + "flos": 16471866142080.0, + "grad_norm": 2.11995710013594, + "language_loss": 0.71666211, + "learning_rate": 3.097265895790444e-07, + "loss": 0.73646474, + "num_input_tokens_seen": 148283730, + "step": 6868, + "time_per_iteration": 2.7545325756073 + }, + { + "auxiliary_loss_clip": 0.01109154, + "auxiliary_loss_mlp": 0.01084409, + "balance_loss_clip": 1.02533698, + "balance_loss_mlp": 1.00425279, + "epoch": 0.8259484157999158, + "flos": 21433822824960.0, + "grad_norm": 1.993345578853021, + "language_loss": 0.83066893, + "learning_rate": 3.093103183220962e-07, + "loss": 0.85260457, + "num_input_tokens_seen": 148303775, + "step": 6869, + "time_per_iteration": 2.7615714073181152 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.01078958, + "balance_loss_clip": 1.01791453, + "balance_loss_mlp": 0.99999398, + "epoch": 0.8260686586905549, + "flos": 58322342453760.0, + "grad_norm": 0.8161211508703694, + "language_loss": 0.59341568, + "learning_rate": 3.0889430354142796e-07, + "loss": 0.61525506, + "num_input_tokens_seen": 148365285, + "step": 6870, + "time_per_iteration": 3.2603843212127686 + }, + { + "auxiliary_loss_clip": 0.01107249, + "auxiliary_loss_mlp": 0.01084216, + "balance_loss_clip": 1.02378774, + "balance_loss_mlp": 1.00401187, + "epoch": 0.826188901581194, + "flos": 27527288814720.0, + "grad_norm": 1.7262233894490462, + "language_loss": 0.70042074, + "learning_rate": 3.084785453001497e-07, + "loss": 0.72233534, + "num_input_tokens_seen": 148386200, + "step": 6871, + "time_per_iteration": 2.7894861698150635 + }, + { + "auxiliary_loss_clip": 0.01116495, + "auxiliary_loss_mlp": 0.00872869, + "balance_loss_clip": 1.02470338, + "balance_loss_mlp": 1.0000999, + "epoch": 0.8263091444718331, + "flos": 23696051339520.0, + "grad_norm": 1.9972528952762407, + "language_loss": 0.81998801, + "learning_rate": 3.080630436613314e-07, + "loss": 0.8398816, + "num_input_tokens_seen": 148403970, + "step": 6872, + "time_per_iteration": 2.7328386306762695 + }, + { + "auxiliary_loss_clip": 0.01126103, + "auxiliary_loss_mlp": 0.01085808, + "balance_loss_clip": 1.02489471, + "balance_loss_mlp": 1.00565147, + "epoch": 0.8264293873624722, + "flos": 17165157523200.0, + "grad_norm": 2.0169443151386615, + "language_loss": 0.85547853, + "learning_rate": 3.076477986880039e-07, + "loss": 0.87759757, + "num_input_tokens_seen": 148421765, + "step": 6873, + "time_per_iteration": 2.6385762691497803 + }, + { + "auxiliary_loss_clip": 0.01115685, + "auxiliary_loss_mlp": 0.01084844, + "balance_loss_clip": 1.0247165, + "balance_loss_mlp": 1.00464034, + "epoch": 0.8265496302531112, + "flos": 24098645952000.0, + "grad_norm": 2.337000363500318, + "language_loss": 0.69311029, + "learning_rate": 3.0723281044315986e-07, + "loss": 0.71511555, + "num_input_tokens_seen": 148443720, + "step": 6874, + "time_per_iteration": 2.747650623321533 + }, + { + "auxiliary_loss_clip": 0.01133902, + "auxiliary_loss_mlp": 0.010841, + "balance_loss_clip": 1.02492261, + "balance_loss_mlp": 1.00408685, + "epoch": 0.8266698731437504, + "flos": 14099894599680.0, + "grad_norm": 2.054138731778396, + "language_loss": 0.76415402, + "learning_rate": 3.068180789897521e-07, + "loss": 0.78633404, + "num_input_tokens_seen": 148462130, + "step": 6875, + "time_per_iteration": 2.6240522861480713 + }, + { + "auxiliary_loss_clip": 0.01126362, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_clip": 1.02507007, + "balance_loss_mlp": 1.00486708, + "epoch": 0.8267901160343895, + "flos": 30777563715840.0, + "grad_norm": 1.403528046102571, + "language_loss": 0.81428599, + "learning_rate": 3.064036043906966e-07, + "loss": 0.83639991, + "num_input_tokens_seen": 148485570, + "step": 6876, + "time_per_iteration": 2.7728021144866943 + }, + { + "auxiliary_loss_clip": 0.01108074, + "auxiliary_loss_mlp": 0.01084628, + "balance_loss_clip": 1.02382767, + "balance_loss_mlp": 1.00442374, + "epoch": 0.8269103589250285, + "flos": 40624915242240.0, + "grad_norm": 2.361192845329686, + "language_loss": 0.68246436, + "learning_rate": 3.059893867088668e-07, + "loss": 0.70439136, + "num_input_tokens_seen": 148509715, + "step": 6877, + "time_per_iteration": 4.716672658920288 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_clip": 1.02472687, + "balance_loss_mlp": 1.00481403, + "epoch": 0.8270306018156677, + "flos": 30263645877120.0, + "grad_norm": 1.8735121010002629, + "language_loss": 0.67185128, + "learning_rate": 3.055754260071004e-07, + "loss": 0.69395065, + "num_input_tokens_seen": 148532010, + "step": 6878, + "time_per_iteration": 2.762773275375366 + }, + { + "auxiliary_loss_clip": 0.01125331, + "auxiliary_loss_mlp": 0.01084889, + "balance_loss_clip": 1.02456582, + "balance_loss_mlp": 1.00478029, + "epoch": 0.8271508447063067, + "flos": 25226599812480.0, + "grad_norm": 1.8178029581281683, + "language_loss": 0.73522115, + "learning_rate": 3.051617223481948e-07, + "loss": 0.75732338, + "num_input_tokens_seen": 148553330, + "step": 6879, + "time_per_iteration": 2.7273917198181152 + }, + { + "auxiliary_loss_clip": 0.010949, + "auxiliary_loss_mlp": 0.01084295, + "balance_loss_clip": 1.02620101, + "balance_loss_mlp": 1.00413871, + "epoch": 0.8272710875969458, + "flos": 17566602900480.0, + "grad_norm": 1.9579897881312882, + "language_loss": 0.75280404, + "learning_rate": 3.047482757949078e-07, + "loss": 0.77459592, + "num_input_tokens_seen": 148570960, + "step": 6880, + "time_per_iteration": 3.6581037044525146 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.00872861, + "balance_loss_clip": 1.02309084, + "balance_loss_mlp": 1.00013793, + "epoch": 0.827391330487585, + "flos": 19755465886080.0, + "grad_norm": 3.862386237125562, + "language_loss": 0.85585713, + "learning_rate": 3.043350864099605e-07, + "loss": 0.87564588, + "num_input_tokens_seen": 148589520, + "step": 6881, + "time_per_iteration": 2.759758234024048 + }, + { + "auxiliary_loss_clip": 0.01127135, + "auxiliary_loss_mlp": 0.01083426, + "balance_loss_clip": 1.02554774, + "balance_loss_mlp": 1.00331712, + "epoch": 0.827511573378224, + "flos": 16835174254080.0, + "grad_norm": 2.357344955016907, + "language_loss": 0.80930847, + "learning_rate": 3.039221542560315e-07, + "loss": 0.83141404, + "num_input_tokens_seen": 148606085, + "step": 6882, + "time_per_iteration": 2.662907838821411 + }, + { + "auxiliary_loss_clip": 0.01124234, + "auxiliary_loss_mlp": 0.01082753, + "balance_loss_clip": 1.02477801, + "balance_loss_mlp": 1.00269175, + "epoch": 0.8276318162688631, + "flos": 18369242259840.0, + "grad_norm": 1.8421178175626731, + "language_loss": 0.7378512, + "learning_rate": 3.0350947939576356e-07, + "loss": 0.75992107, + "num_input_tokens_seen": 148625240, + "step": 6883, + "time_per_iteration": 2.659902572631836 + }, + { + "auxiliary_loss_clip": 0.01128718, + "auxiliary_loss_mlp": 0.01083155, + "balance_loss_clip": 1.02678084, + "balance_loss_mlp": 1.00295091, + "epoch": 0.8277520591595022, + "flos": 19352691705600.0, + "grad_norm": 1.9382532156833612, + "language_loss": 0.72332644, + "learning_rate": 3.0309706189175876e-07, + "loss": 0.74544513, + "num_input_tokens_seen": 148645075, + "step": 6884, + "time_per_iteration": 3.5281527042388916 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01079051, + "balance_loss_clip": 1.01855278, + "balance_loss_mlp": 1.00008702, + "epoch": 0.8278723020501413, + "flos": 67918858329600.0, + "grad_norm": 0.7609557391608958, + "language_loss": 0.57425427, + "learning_rate": 3.0268490180658045e-07, + "loss": 0.59602273, + "num_input_tokens_seen": 148707855, + "step": 6885, + "time_per_iteration": 3.3319308757781982 + }, + { + "auxiliary_loss_clip": 0.01135872, + "auxiliary_loss_mlp": 0.01083813, + "balance_loss_clip": 1.02610993, + "balance_loss_mlp": 1.00379944, + "epoch": 0.8279925449407803, + "flos": 18185738653440.0, + "grad_norm": 2.340161853982425, + "language_loss": 0.79497439, + "learning_rate": 3.0227299920275305e-07, + "loss": 0.81717122, + "num_input_tokens_seen": 148724170, + "step": 6886, + "time_per_iteration": 2.5707359313964844 + }, + { + "auxiliary_loss_clip": 0.01105435, + "auxiliary_loss_mlp": 0.01084735, + "balance_loss_clip": 1.02369177, + "balance_loss_mlp": 1.00443554, + "epoch": 0.8281127878314195, + "flos": 20631434860800.0, + "grad_norm": 2.4518263015116153, + "language_loss": 0.85486555, + "learning_rate": 3.018613541427613e-07, + "loss": 0.87676728, + "num_input_tokens_seen": 148743690, + "step": 6887, + "time_per_iteration": 2.7451367378234863 + }, + { + "auxiliary_loss_clip": 0.01134659, + "auxiliary_loss_mlp": 0.0108313, + "balance_loss_clip": 1.02517056, + "balance_loss_mlp": 1.00306892, + "epoch": 0.8282330307220586, + "flos": 18004282122240.0, + "grad_norm": 1.6003763050042448, + "language_loss": 0.73792052, + "learning_rate": 3.0144996668905243e-07, + "loss": 0.76009846, + "num_input_tokens_seen": 148761070, + "step": 6888, + "time_per_iteration": 2.5871901512145996 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.00872903, + "balance_loss_clip": 1.02378738, + "balance_loss_mlp": 1.00010753, + "epoch": 0.8283532736126976, + "flos": 20084120352000.0, + "grad_norm": 1.9643138471184982, + "language_loss": 0.8210032, + "learning_rate": 3.010388369040331e-07, + "loss": 0.84064966, + "num_input_tokens_seen": 148779730, + "step": 6889, + "time_per_iteration": 2.876007318496704 + }, + { + "auxiliary_loss_clip": 0.0112426, + "auxiliary_loss_mlp": 0.01083564, + "balance_loss_clip": 1.02399921, + "balance_loss_mlp": 1.00345564, + "epoch": 0.8284735165033368, + "flos": 31868421805440.0, + "grad_norm": 1.6602812565294744, + "language_loss": 0.82636642, + "learning_rate": 3.0062796485007156e-07, + "loss": 0.8484447, + "num_input_tokens_seen": 148800670, + "step": 6890, + "time_per_iteration": 2.7483913898468018 + }, + { + "auxiliary_loss_clip": 0.01135735, + "auxiliary_loss_mlp": 0.0087286, + "balance_loss_clip": 1.02622366, + "balance_loss_mlp": 1.00012636, + "epoch": 0.8285937593939758, + "flos": 26651319840000.0, + "grad_norm": 6.213897002716884, + "language_loss": 0.65300357, + "learning_rate": 3.002173505894965e-07, + "loss": 0.6730895, + "num_input_tokens_seen": 148819820, + "step": 6891, + "time_per_iteration": 2.660088062286377 + }, + { + "auxiliary_loss_clip": 0.0112718, + "auxiliary_loss_mlp": 0.01083553, + "balance_loss_clip": 1.02508497, + "balance_loss_mlp": 1.00330114, + "epoch": 0.8287140022846149, + "flos": 20193683811840.0, + "grad_norm": 2.7112276991149202, + "language_loss": 0.62415487, + "learning_rate": 2.998069941845973e-07, + "loss": 0.64626223, + "num_input_tokens_seen": 148838890, + "step": 6892, + "time_per_iteration": 2.6998705863952637 + }, + { + "auxiliary_loss_clip": 0.01113451, + "auxiliary_loss_mlp": 0.01078857, + "balance_loss_clip": 1.01828814, + "balance_loss_mlp": 0.99989265, + "epoch": 0.8288342451752541, + "flos": 70755980019840.0, + "grad_norm": 0.7074915175599389, + "language_loss": 0.57527995, + "learning_rate": 2.993968956976258e-07, + "loss": 0.59720302, + "num_input_tokens_seen": 148906635, + "step": 6893, + "time_per_iteration": 3.3745028972625732 + }, + { + "auxiliary_loss_clip": 0.01136447, + "auxiliary_loss_mlp": 0.01084572, + "balance_loss_clip": 1.02645254, + "balance_loss_mlp": 1.00436759, + "epoch": 0.8289544880658931, + "flos": 24572235795840.0, + "grad_norm": 1.7358995165040234, + "language_loss": 0.70141733, + "learning_rate": 2.9898705519079313e-07, + "loss": 0.72362757, + "num_input_tokens_seen": 148925740, + "step": 6894, + "time_per_iteration": 2.696096420288086 + }, + { + "auxiliary_loss_clip": 0.0111545, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_clip": 1.02417755, + "balance_loss_mlp": 1.00397897, + "epoch": 0.8290747309565322, + "flos": 22273378387200.0, + "grad_norm": 1.8677332840504683, + "language_loss": 0.75024432, + "learning_rate": 2.985774727262715e-07, + "loss": 0.77223969, + "num_input_tokens_seen": 148944585, + "step": 6895, + "time_per_iteration": 2.6689960956573486 + }, + { + "auxiliary_loss_clip": 0.01135374, + "auxiliary_loss_mlp": 0.01083592, + "balance_loss_clip": 1.02574813, + "balance_loss_mlp": 1.00357866, + "epoch": 0.8291949738471713, + "flos": 23255570856960.0, + "grad_norm": 1.6483429054235814, + "language_loss": 0.81687087, + "learning_rate": 2.981681483661949e-07, + "loss": 0.83906054, + "num_input_tokens_seen": 148964170, + "step": 6896, + "time_per_iteration": 2.6610798835754395 + }, + { + "auxiliary_loss_clip": 0.01125873, + "auxiliary_loss_mlp": 0.01084795, + "balance_loss_clip": 1.02579951, + "balance_loss_mlp": 1.0047344, + "epoch": 0.8293152167378104, + "flos": 52555768185600.0, + "grad_norm": 1.5979771420422624, + "language_loss": 0.70765072, + "learning_rate": 2.9775908217265633e-07, + "loss": 0.72975743, + "num_input_tokens_seen": 148989405, + "step": 6897, + "time_per_iteration": 2.9121620655059814 + }, + { + "auxiliary_loss_clip": 0.01073836, + "auxiliary_loss_mlp": 0.01079527, + "balance_loss_clip": 1.02044594, + "balance_loss_mlp": 1.00056231, + "epoch": 0.8294354596284494, + "flos": 63356156294400.0, + "grad_norm": 0.8316970222812626, + "language_loss": 0.50339711, + "learning_rate": 2.9735027420771253e-07, + "loss": 0.52493072, + "num_input_tokens_seen": 149049740, + "step": 6898, + "time_per_iteration": 3.358311891555786 + }, + { + "auxiliary_loss_clip": 0.01116286, + "auxiliary_loss_mlp": 0.01083445, + "balance_loss_clip": 1.02561545, + "balance_loss_mlp": 1.00338364, + "epoch": 0.8295557025190886, + "flos": 24827021942400.0, + "grad_norm": 1.6368152483780394, + "language_loss": 0.71431756, + "learning_rate": 2.969417245333774e-07, + "loss": 0.73631495, + "num_input_tokens_seen": 149069120, + "step": 6899, + "time_per_iteration": 2.756394147872925 + }, + { + "auxiliary_loss_clip": 0.0110604, + "auxiliary_loss_mlp": 0.01083237, + "balance_loss_clip": 1.02403426, + "balance_loss_mlp": 1.00317621, + "epoch": 0.8296759454097277, + "flos": 25118580637440.0, + "grad_norm": 2.0790080612463053, + "language_loss": 0.78378022, + "learning_rate": 2.9653343321162915e-07, + "loss": 0.805673, + "num_input_tokens_seen": 149088630, + "step": 6900, + "time_per_iteration": 2.7410900592803955 + }, + { + "auxiliary_loss_clip": 0.01107166, + "auxiliary_loss_mlp": 0.01085582, + "balance_loss_clip": 1.02492106, + "balance_loss_mlp": 1.00542533, + "epoch": 0.8297961883003667, + "flos": 24132581326080.0, + "grad_norm": 1.817235312057855, + "language_loss": 0.65142828, + "learning_rate": 2.9612540030440446e-07, + "loss": 0.67335576, + "num_input_tokens_seen": 149109175, + "step": 6901, + "time_per_iteration": 2.7821333408355713 + }, + { + "auxiliary_loss_clip": 0.01095999, + "auxiliary_loss_mlp": 0.01078953, + "balance_loss_clip": 1.01768351, + "balance_loss_mlp": 0.99998903, + "epoch": 0.8299164311910058, + "flos": 67446561375360.0, + "grad_norm": 0.8495375147793038, + "language_loss": 0.6412918, + "learning_rate": 2.9571762587360206e-07, + "loss": 0.66304129, + "num_input_tokens_seen": 149165560, + "step": 6902, + "time_per_iteration": 4.089302062988281 + }, + { + "auxiliary_loss_clip": 0.01100397, + "auxiliary_loss_mlp": 0.01083765, + "balance_loss_clip": 1.02336383, + "balance_loss_mlp": 1.00360906, + "epoch": 0.8300366740816449, + "flos": 25228682801280.0, + "grad_norm": 1.4754026099323525, + "language_loss": 0.74206686, + "learning_rate": 2.953101099810806e-07, + "loss": 0.76390851, + "num_input_tokens_seen": 149185165, + "step": 6903, + "time_per_iteration": 3.855734348297119 + }, + { + "auxiliary_loss_clip": 0.01118397, + "auxiliary_loss_mlp": 0.01083632, + "balance_loss_clip": 1.02454495, + "balance_loss_mlp": 1.0035708, + "epoch": 0.830156916972284, + "flos": 18041018757120.0, + "grad_norm": 1.871352880781536, + "language_loss": 0.82440734, + "learning_rate": 2.9490285268865965e-07, + "loss": 0.84642768, + "num_input_tokens_seen": 149202655, + "step": 6904, + "time_per_iteration": 2.6532809734344482 + }, + { + "auxiliary_loss_clip": 0.01127449, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_clip": 1.02669108, + "balance_loss_mlp": 1.00436497, + "epoch": 0.830277159862923, + "flos": 26322485806080.0, + "grad_norm": 1.9505350443147793, + "language_loss": 0.7945205, + "learning_rate": 2.9449585405812085e-07, + "loss": 0.81664163, + "num_input_tokens_seen": 149220035, + "step": 6905, + "time_per_iteration": 2.710186004638672 + }, + { + "auxiliary_loss_clip": 0.01089318, + "auxiliary_loss_mlp": 0.01083857, + "balance_loss_clip": 1.0229497, + "balance_loss_mlp": 1.00374866, + "epoch": 0.8303974027535622, + "flos": 19938861751680.0, + "grad_norm": 6.997089862583248, + "language_loss": 0.73766303, + "learning_rate": 2.940891141512043e-07, + "loss": 0.75939476, + "num_input_tokens_seen": 149238055, + "step": 6906, + "time_per_iteration": 3.7239298820495605 + }, + { + "auxiliary_loss_clip": 0.01118647, + "auxiliary_loss_mlp": 0.01084503, + "balance_loss_clip": 1.02526999, + "balance_loss_mlp": 1.0042994, + "epoch": 0.8305176456442013, + "flos": 17165552572800.0, + "grad_norm": 1.9671132002800662, + "language_loss": 0.71615756, + "learning_rate": 2.9368263302961385e-07, + "loss": 0.7381891, + "num_input_tokens_seen": 149256755, + "step": 6907, + "time_per_iteration": 2.6937458515167236 + }, + { + "auxiliary_loss_clip": 0.010866, + "auxiliary_loss_mlp": 0.01082915, + "balance_loss_clip": 1.02187467, + "balance_loss_mlp": 1.00275886, + "epoch": 0.8306378885348403, + "flos": 25627614226560.0, + "grad_norm": 1.7891859980245155, + "language_loss": 0.7989139, + "learning_rate": 2.9327641075501075e-07, + "loss": 0.82060903, + "num_input_tokens_seen": 149275745, + "step": 6908, + "time_per_iteration": 2.816917896270752 + }, + { + "auxiliary_loss_clip": 0.01118258, + "auxiliary_loss_mlp": 0.01084599, + "balance_loss_clip": 1.0250355, + "balance_loss_mlp": 1.00444317, + "epoch": 0.8307581314254795, + "flos": 33947864985600.0, + "grad_norm": 2.7341496555803686, + "language_loss": 0.66090059, + "learning_rate": 2.9287044738901866e-07, + "loss": 0.68292916, + "num_input_tokens_seen": 149293730, + "step": 6909, + "time_per_iteration": 2.837174415588379 + }, + { + "auxiliary_loss_clip": 0.01126583, + "auxiliary_loss_mlp": 0.00872809, + "balance_loss_clip": 1.02560282, + "balance_loss_mlp": 1.00012648, + "epoch": 0.8308783743161186, + "flos": 17562724231680.0, + "grad_norm": 1.9303294639772846, + "language_loss": 0.90741599, + "learning_rate": 2.9246474299322274e-07, + "loss": 0.92740989, + "num_input_tokens_seen": 149309290, + "step": 6910, + "time_per_iteration": 3.5413670539855957 + }, + { + "auxiliary_loss_clip": 0.01074192, + "auxiliary_loss_mlp": 0.01078856, + "balance_loss_clip": 1.02085328, + "balance_loss_mlp": 0.99989146, + "epoch": 0.8309986172067576, + "flos": 69412885649280.0, + "grad_norm": 0.883271228511641, + "language_loss": 0.63173622, + "learning_rate": 2.920592976291678e-07, + "loss": 0.65326667, + "num_input_tokens_seen": 149366620, + "step": 6911, + "time_per_iteration": 3.2851462364196777 + }, + { + "auxiliary_loss_clip": 0.01126588, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_clip": 1.02546334, + "balance_loss_mlp": 1.00488496, + "epoch": 0.8311188600973968, + "flos": 22309755886080.0, + "grad_norm": 2.406214790439411, + "language_loss": 0.80424142, + "learning_rate": 2.916541113583595e-07, + "loss": 0.82635725, + "num_input_tokens_seen": 149385120, + "step": 6912, + "time_per_iteration": 2.7258167266845703 + }, + { + "auxiliary_loss_clip": 0.0109193, + "auxiliary_loss_mlp": 0.01083093, + "balance_loss_clip": 1.02648616, + "balance_loss_mlp": 1.00298429, + "epoch": 0.8312391029880358, + "flos": 18770077105920.0, + "grad_norm": 2.131774664811683, + "language_loss": 0.66313463, + "learning_rate": 2.912491842422642e-07, + "loss": 0.68488485, + "num_input_tokens_seen": 149402825, + "step": 6913, + "time_per_iteration": 2.679232120513916 + }, + { + "auxiliary_loss_clip": 0.01128844, + "auxiliary_loss_mlp": 0.010852, + "balance_loss_clip": 1.02727389, + "balance_loss_mlp": 1.00509119, + "epoch": 0.8313593458786749, + "flos": 20376648714240.0, + "grad_norm": 1.6881753763012441, + "language_loss": 0.70868397, + "learning_rate": 2.9084451634230857e-07, + "loss": 0.73082447, + "num_input_tokens_seen": 149422125, + "step": 6914, + "time_per_iteration": 2.692349910736084 + }, + { + "auxiliary_loss_clip": 0.01105948, + "auxiliary_loss_mlp": 0.01083364, + "balance_loss_clip": 1.02351737, + "balance_loss_mlp": 1.00320768, + "epoch": 0.831479588769314, + "flos": 32124069878400.0, + "grad_norm": 2.4314604566100066, + "language_loss": 0.71240032, + "learning_rate": 2.9044010771988125e-07, + "loss": 0.73429346, + "num_input_tokens_seen": 149441940, + "step": 6915, + "time_per_iteration": 2.8400585651397705 + }, + { + "auxiliary_loss_clip": 0.01117266, + "auxiliary_loss_mlp": 0.01085258, + "balance_loss_clip": 1.02503037, + "balance_loss_mlp": 1.00514984, + "epoch": 0.8315998316599531, + "flos": 45185929338240.0, + "grad_norm": 1.6440200262639912, + "language_loss": 0.72221446, + "learning_rate": 2.900359584363303e-07, + "loss": 0.74423975, + "num_input_tokens_seen": 149465045, + "step": 6916, + "time_per_iteration": 2.932835817337036 + }, + { + "auxiliary_loss_clip": 0.01093408, + "auxiliary_loss_mlp": 0.01083886, + "balance_loss_clip": 1.02453876, + "balance_loss_mlp": 1.0036819, + "epoch": 0.8317200745505922, + "flos": 18363747479040.0, + "grad_norm": 2.1824399250462907, + "language_loss": 0.84238988, + "learning_rate": 2.8963206855296494e-07, + "loss": 0.8641628, + "num_input_tokens_seen": 149481285, + "step": 6917, + "time_per_iteration": 2.8060474395751953 + }, + { + "auxiliary_loss_clip": 0.01126048, + "auxiliary_loss_mlp": 0.01083421, + "balance_loss_clip": 1.02523184, + "balance_loss_mlp": 1.00331283, + "epoch": 0.8318403174412313, + "flos": 24206557386240.0, + "grad_norm": 1.5854412781861331, + "language_loss": 0.76976246, + "learning_rate": 2.892284381310548e-07, + "loss": 0.79185718, + "num_input_tokens_seen": 149502700, + "step": 6918, + "time_per_iteration": 2.6248278617858887 + }, + { + "auxiliary_loss_clip": 0.01107825, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_clip": 1.02289045, + "balance_loss_mlp": 1.00339937, + "epoch": 0.8319605603318704, + "flos": 22418780641920.0, + "grad_norm": 2.4279112340308107, + "language_loss": 0.72245783, + "learning_rate": 2.888250672318302e-07, + "loss": 0.74437213, + "num_input_tokens_seen": 149520100, + "step": 6919, + "time_per_iteration": 2.741379737854004 + }, + { + "auxiliary_loss_clip": 0.01137192, + "auxiliary_loss_mlp": 0.01085296, + "balance_loss_clip": 1.02752781, + "balance_loss_mlp": 1.00514007, + "epoch": 0.8320808032225094, + "flos": 37414501459200.0, + "grad_norm": 1.500139027325814, + "language_loss": 0.68521953, + "learning_rate": 2.884219559164831e-07, + "loss": 0.70744443, + "num_input_tokens_seen": 149543245, + "step": 6920, + "time_per_iteration": 2.7386586666107178 + }, + { + "auxiliary_loss_clip": 0.01124443, + "auxiliary_loss_mlp": 0.01083299, + "balance_loss_clip": 1.02478838, + "balance_loss_mlp": 1.00314283, + "epoch": 0.8322010461131486, + "flos": 12787395638400.0, + "grad_norm": 1.9104423537228061, + "language_loss": 0.81260598, + "learning_rate": 2.880191042461635e-07, + "loss": 0.83468336, + "num_input_tokens_seen": 149559185, + "step": 6921, + "time_per_iteration": 2.7717645168304443 + }, + { + "auxiliary_loss_clip": 0.01100451, + "auxiliary_loss_mlp": 0.01085012, + "balance_loss_clip": 1.02456689, + "balance_loss_mlp": 1.00495136, + "epoch": 0.8323212890037877, + "flos": 15815455050240.0, + "grad_norm": 1.6328999957046826, + "language_loss": 0.80293763, + "learning_rate": 2.876165122819849e-07, + "loss": 0.82479227, + "num_input_tokens_seen": 149577165, + "step": 6922, + "time_per_iteration": 2.7684831619262695 + }, + { + "auxiliary_loss_clip": 0.01134224, + "auxiliary_loss_mlp": 0.01083566, + "balance_loss_clip": 1.02529335, + "balance_loss_mlp": 1.00345778, + "epoch": 0.8324415318944267, + "flos": 21719276208000.0, + "grad_norm": 1.521157452695516, + "language_loss": 0.79453146, + "learning_rate": 2.872141800850201e-07, + "loss": 0.8167094, + "num_input_tokens_seen": 149594340, + "step": 6923, + "time_per_iteration": 2.6108524799346924 + }, + { + "auxiliary_loss_clip": 0.01134466, + "auxiliary_loss_mlp": 0.01083627, + "balance_loss_clip": 1.02531874, + "balance_loss_mlp": 1.0035187, + "epoch": 0.8325617747850659, + "flos": 34198700636160.0, + "grad_norm": 1.647286215602168, + "language_loss": 0.73143029, + "learning_rate": 2.868121077163024e-07, + "loss": 0.75361121, + "num_input_tokens_seen": 149613895, + "step": 6924, + "time_per_iteration": 2.7382171154022217 + }, + { + "auxiliary_loss_clip": 0.01126781, + "auxiliary_loss_mlp": 0.0108588, + "balance_loss_clip": 1.02537799, + "balance_loss_mlp": 1.00581956, + "epoch": 0.8326820176757049, + "flos": 18369457741440.0, + "grad_norm": 1.7564334406436937, + "language_loss": 0.72403622, + "learning_rate": 2.864102952368257e-07, + "loss": 0.74616277, + "num_input_tokens_seen": 149631820, + "step": 6925, + "time_per_iteration": 2.6716413497924805 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01082978, + "balance_loss_clip": 1.02306712, + "balance_loss_mlp": 1.00286984, + "epoch": 0.832802260566344, + "flos": 35991325716480.0, + "grad_norm": 1.366349065615935, + "language_loss": 0.59174901, + "learning_rate": 2.860087427075444e-07, + "loss": 0.61356592, + "num_input_tokens_seen": 149656070, + "step": 6926, + "time_per_iteration": 2.862346649169922 + }, + { + "auxiliary_loss_clip": 0.01118162, + "auxiliary_loss_mlp": 0.01083607, + "balance_loss_clip": 1.02513003, + "balance_loss_mlp": 1.00354648, + "epoch": 0.8329225034569832, + "flos": 14244434928000.0, + "grad_norm": 2.395770835832908, + "language_loss": 0.86186421, + "learning_rate": 2.856074501893744e-07, + "loss": 0.88388187, + "num_input_tokens_seen": 149671270, + "step": 6927, + "time_per_iteration": 2.6952648162841797 + }, + { + "auxiliary_loss_clip": 0.01126529, + "auxiliary_loss_mlp": 0.01084796, + "balance_loss_clip": 1.02591991, + "balance_loss_mlp": 1.00468731, + "epoch": 0.8330427463476222, + "flos": 18077468083200.0, + "grad_norm": 1.9203340641531659, + "language_loss": 0.81686491, + "learning_rate": 2.8520641774319054e-07, + "loss": 0.83897817, + "num_input_tokens_seen": 149689360, + "step": 6928, + "time_per_iteration": 3.6077020168304443 + }, + { + "auxiliary_loss_clip": 0.01117108, + "auxiliary_loss_mlp": 0.01084397, + "balance_loss_clip": 1.02390051, + "balance_loss_mlp": 1.00424027, + "epoch": 0.8331629892382613, + "flos": 18040839189120.0, + "grad_norm": 2.2789770450256377, + "language_loss": 0.76061702, + "learning_rate": 2.848056454298309e-07, + "loss": 0.78263211, + "num_input_tokens_seen": 149706685, + "step": 6929, + "time_per_iteration": 2.6242825984954834 + }, + { + "auxiliary_loss_clip": 0.01114688, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_clip": 1.02414799, + "balance_loss_mlp": 1.00418901, + "epoch": 0.8332832321289004, + "flos": 17457398576640.0, + "grad_norm": 2.094591417477703, + "language_loss": 0.65308267, + "learning_rate": 2.844051333100905e-07, + "loss": 0.67507303, + "num_input_tokens_seen": 149724230, + "step": 6930, + "time_per_iteration": 2.694448471069336 + }, + { + "auxiliary_loss_clip": 0.01116265, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_clip": 1.02508044, + "balance_loss_mlp": 1.00473201, + "epoch": 0.8334034750195395, + "flos": 15084852416640.0, + "grad_norm": 1.7711466053563256, + "language_loss": 0.83939934, + "learning_rate": 2.840048814447269e-07, + "loss": 0.8614099, + "num_input_tokens_seen": 149742395, + "step": 6931, + "time_per_iteration": 3.6348395347595215 + }, + { + "auxiliary_loss_clip": 0.01120282, + "auxiliary_loss_mlp": 0.01085156, + "balance_loss_clip": 1.02636385, + "balance_loss_mlp": 1.00504756, + "epoch": 0.8335237179101785, + "flos": 19427170556160.0, + "grad_norm": 2.473001622839171, + "language_loss": 0.74240708, + "learning_rate": 2.836048898944587e-07, + "loss": 0.76446146, + "num_input_tokens_seen": 149760820, + "step": 6932, + "time_per_iteration": 2.710084915161133 + }, + { + "auxiliary_loss_clip": 0.01115794, + "auxiliary_loss_mlp": 0.01083597, + "balance_loss_clip": 1.02378464, + "balance_loss_mlp": 1.00353599, + "epoch": 0.8336439608008177, + "flos": 21762046327680.0, + "grad_norm": 2.157723509319319, + "language_loss": 0.7262969, + "learning_rate": 2.832051587199642e-07, + "loss": 0.74829078, + "num_input_tokens_seen": 149778075, + "step": 6933, + "time_per_iteration": 2.7072858810424805 + }, + { + "auxiliary_loss_clip": 0.011053, + "auxiliary_loss_mlp": 0.01079071, + "balance_loss_clip": 1.01820707, + "balance_loss_mlp": 1.00010633, + "epoch": 0.8337642036914568, + "flos": 59702783990400.0, + "grad_norm": 0.8039311211816841, + "language_loss": 0.57792413, + "learning_rate": 2.828056879818821e-07, + "loss": 0.59976792, + "num_input_tokens_seen": 149837150, + "step": 6934, + "time_per_iteration": 3.1964786052703857 + }, + { + "auxiliary_loss_clip": 0.01109422, + "auxiliary_loss_mlp": 0.01083863, + "balance_loss_clip": 1.02457654, + "balance_loss_mlp": 1.00389743, + "epoch": 0.8338844465820958, + "flos": 27162185022720.0, + "grad_norm": 1.772012617099726, + "language_loss": 0.83459222, + "learning_rate": 2.824064777408117e-07, + "loss": 0.85652506, + "num_input_tokens_seen": 149856940, + "step": 6935, + "time_per_iteration": 3.754775285720825 + }, + { + "auxiliary_loss_clip": 0.01125466, + "auxiliary_loss_mlp": 0.01085016, + "balance_loss_clip": 1.02496195, + "balance_loss_mlp": 1.00481153, + "epoch": 0.8340046894727349, + "flos": 30481264425600.0, + "grad_norm": 1.7520462850352203, + "language_loss": 0.7607041, + "learning_rate": 2.8200752805731263e-07, + "loss": 0.7828089, + "num_input_tokens_seen": 149879930, + "step": 6936, + "time_per_iteration": 2.7314858436584473 + }, + { + "auxiliary_loss_clip": 0.01125264, + "auxiliary_loss_mlp": 0.01084729, + "balance_loss_clip": 1.02553737, + "balance_loss_mlp": 1.00452495, + "epoch": 0.834124932363374, + "flos": 27126166659840.0, + "grad_norm": 1.4647524440722315, + "language_loss": 0.81084901, + "learning_rate": 2.8160883899190625e-07, + "loss": 0.83294898, + "num_input_tokens_seen": 149903200, + "step": 6937, + "time_per_iteration": 2.7943592071533203 + }, + { + "auxiliary_loss_clip": 0.01106954, + "auxiliary_loss_mlp": 0.01083851, + "balance_loss_clip": 1.02416658, + "balance_loss_mlp": 1.00374198, + "epoch": 0.8342451752540131, + "flos": 24569865498240.0, + "grad_norm": 2.5647324587520055, + "language_loss": 0.72871375, + "learning_rate": 2.8121041060507234e-07, + "loss": 0.75062186, + "num_input_tokens_seen": 149922230, + "step": 6938, + "time_per_iteration": 2.7912447452545166 + }, + { + "auxiliary_loss_clip": 0.01127112, + "auxiliary_loss_mlp": 0.01083119, + "balance_loss_clip": 1.02555597, + "balance_loss_mlp": 1.00296307, + "epoch": 0.8343654181446521, + "flos": 26615085995520.0, + "grad_norm": 1.535095537962691, + "language_loss": 0.71423233, + "learning_rate": 2.808122429572528e-07, + "loss": 0.73633462, + "num_input_tokens_seen": 149942435, + "step": 6939, + "time_per_iteration": 2.6703100204467773 + }, + { + "auxiliary_loss_clip": 0.01106956, + "auxiliary_loss_mlp": 0.01083405, + "balance_loss_clip": 1.02283061, + "balance_loss_mlp": 1.00324917, + "epoch": 0.8344856610352913, + "flos": 20777268078720.0, + "grad_norm": 2.4092000749271034, + "language_loss": 0.75639313, + "learning_rate": 2.804143361088489e-07, + "loss": 0.77829677, + "num_input_tokens_seen": 149961615, + "step": 6940, + "time_per_iteration": 2.819854974746704 + }, + { + "auxiliary_loss_clip": 0.01116908, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_clip": 1.02447391, + "balance_loss_mlp": 1.00332594, + "epoch": 0.8346059039259304, + "flos": 26095960684800.0, + "grad_norm": 2.6178764332844016, + "language_loss": 0.77975678, + "learning_rate": 2.8001669012022277e-07, + "loss": 0.80176067, + "num_input_tokens_seen": 149979585, + "step": 6941, + "time_per_iteration": 2.689046859741211 + }, + { + "auxiliary_loss_clip": 0.01126843, + "auxiliary_loss_mlp": 0.01085015, + "balance_loss_clip": 1.02695918, + "balance_loss_mlp": 1.00490642, + "epoch": 0.8347261468165694, + "flos": 29027708755200.0, + "grad_norm": 1.5829387750902004, + "language_loss": 0.69363457, + "learning_rate": 2.7961930505169795e-07, + "loss": 0.71575314, + "num_input_tokens_seen": 150003830, + "step": 6942, + "time_per_iteration": 2.7895560264587402 + }, + { + "auxiliary_loss_clip": 0.01125147, + "auxiliary_loss_mlp": 0.00872927, + "balance_loss_clip": 1.02490926, + "balance_loss_mlp": 1.00013447, + "epoch": 0.8348463897072086, + "flos": 26396461866240.0, + "grad_norm": 2.058253567357249, + "language_loss": 0.76562822, + "learning_rate": 2.792221809635558e-07, + "loss": 0.78560901, + "num_input_tokens_seen": 150024460, + "step": 6943, + "time_per_iteration": 2.6663942337036133 + }, + { + "auxiliary_loss_clip": 0.01076207, + "auxiliary_loss_mlp": 0.01083968, + "balance_loss_clip": 1.02066207, + "balance_loss_mlp": 1.00381136, + "epoch": 0.8349666325978476, + "flos": 23367720096000.0, + "grad_norm": 2.10783237267728, + "language_loss": 0.75057018, + "learning_rate": 2.788253179160411e-07, + "loss": 0.77217185, + "num_input_tokens_seen": 150045620, + "step": 6944, + "time_per_iteration": 2.933046817779541 + }, + { + "auxiliary_loss_clip": 0.01116002, + "auxiliary_loss_mlp": 0.01084112, + "balance_loss_clip": 1.02464819, + "balance_loss_mlp": 1.00395584, + "epoch": 0.8350868754884867, + "flos": 12896528135040.0, + "grad_norm": 1.9933093101361292, + "language_loss": 0.64877725, + "learning_rate": 2.7842871596935725e-07, + "loss": 0.67077839, + "num_input_tokens_seen": 150064135, + "step": 6945, + "time_per_iteration": 2.944234848022461 + }, + { + "auxiliary_loss_clip": 0.0111071, + "auxiliary_loss_mlp": 0.01084134, + "balance_loss_clip": 1.02528763, + "balance_loss_mlp": 1.00397754, + "epoch": 0.8352071183791259, + "flos": 26505522535680.0, + "grad_norm": 1.4657281572865164, + "language_loss": 0.68870562, + "learning_rate": 2.780323751836682e-07, + "loss": 0.71065408, + "num_input_tokens_seen": 150085350, + "step": 6946, + "time_per_iteration": 2.7324559688568115 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.00872797, + "balance_loss_clip": 1.02553272, + "balance_loss_mlp": 1.00012565, + "epoch": 0.8353273612697649, + "flos": 20668063754880.0, + "grad_norm": 1.4845632306998309, + "language_loss": 0.78724152, + "learning_rate": 2.7763629561909876e-07, + "loss": 0.80715626, + "num_input_tokens_seen": 150106180, + "step": 6947, + "time_per_iteration": 2.8337154388427734 + }, + { + "auxiliary_loss_clip": 0.01133877, + "auxiliary_loss_mlp": 0.01084907, + "balance_loss_clip": 1.02494204, + "balance_loss_mlp": 1.00479794, + "epoch": 0.835447604160404, + "flos": 19754137082880.0, + "grad_norm": 1.981271108507569, + "language_loss": 0.77318931, + "learning_rate": 2.772404773357335e-07, + "loss": 0.79537714, + "num_input_tokens_seen": 150125585, + "step": 6948, + "time_per_iteration": 2.631648302078247 + }, + { + "auxiliary_loss_clip": 0.01105319, + "auxiliary_loss_mlp": 0.0108381, + "balance_loss_clip": 1.02186215, + "balance_loss_mlp": 1.00370169, + "epoch": 0.8355678470510431, + "flos": 23435842239360.0, + "grad_norm": 1.8157463175195783, + "language_loss": 0.78207195, + "learning_rate": 2.7684492039361853e-07, + "loss": 0.80396318, + "num_input_tokens_seen": 150144810, + "step": 6949, + "time_per_iteration": 2.807415008544922 + }, + { + "auxiliary_loss_clip": 0.01135354, + "auxiliary_loss_mlp": 0.01084152, + "balance_loss_clip": 1.02609503, + "balance_loss_mlp": 1.00404334, + "epoch": 0.8356880899416822, + "flos": 21214588164480.0, + "grad_norm": 2.5137880904085574, + "language_loss": 0.83573544, + "learning_rate": 2.764496248527586e-07, + "loss": 0.85793048, + "num_input_tokens_seen": 150163785, + "step": 6950, + "time_per_iteration": 2.6584677696228027 + }, + { + "auxiliary_loss_clip": 0.01093222, + "auxiliary_loss_mlp": 0.01084652, + "balance_loss_clip": 1.02506912, + "balance_loss_mlp": 1.00449586, + "epoch": 0.8358083328323213, + "flos": 28037543466240.0, + "grad_norm": 2.3009800363225525, + "language_loss": 0.78630894, + "learning_rate": 2.760545907731211e-07, + "loss": 0.80808771, + "num_input_tokens_seen": 150184360, + "step": 6951, + "time_per_iteration": 2.8448755741119385 + }, + { + "auxiliary_loss_clip": 0.01126979, + "auxiliary_loss_mlp": 0.010838, + "balance_loss_clip": 1.02536309, + "balance_loss_mlp": 1.00364339, + "epoch": 0.8359285757229604, + "flos": 27783655159680.0, + "grad_norm": 1.6049186648644687, + "language_loss": 0.67904043, + "learning_rate": 2.75659818214631e-07, + "loss": 0.70114827, + "num_input_tokens_seen": 150205465, + "step": 6952, + "time_per_iteration": 2.6982412338256836 + }, + { + "auxiliary_loss_clip": 0.01116905, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_clip": 1.02466083, + "balance_loss_mlp": 1.00407708, + "epoch": 0.8360488186135995, + "flos": 21435115714560.0, + "grad_norm": 1.7282246737059088, + "language_loss": 0.78040552, + "learning_rate": 2.752653072371749e-07, + "loss": 0.80241692, + "num_input_tokens_seen": 150224900, + "step": 6953, + "time_per_iteration": 3.744593858718872 + }, + { + "auxiliary_loss_clip": 0.01104049, + "auxiliary_loss_mlp": 0.01085207, + "balance_loss_clip": 1.02289712, + "balance_loss_mlp": 1.00514567, + "epoch": 0.8361690615042385, + "flos": 27632327160960.0, + "grad_norm": 1.6341092238511297, + "language_loss": 0.74725068, + "learning_rate": 2.7487105790060105e-07, + "loss": 0.76914328, + "num_input_tokens_seen": 150244310, + "step": 6954, + "time_per_iteration": 3.7723400592803955 + }, + { + "auxiliary_loss_clip": 0.0112573, + "auxiliary_loss_mlp": 0.01083499, + "balance_loss_clip": 1.02456951, + "balance_loss_mlp": 1.00343788, + "epoch": 0.8362893043948777, + "flos": 39202529598720.0, + "grad_norm": 1.945626437253537, + "language_loss": 0.69143695, + "learning_rate": 2.7447707026471587e-07, + "loss": 0.71352923, + "num_input_tokens_seen": 150267285, + "step": 6955, + "time_per_iteration": 2.780977725982666 + }, + { + "auxiliary_loss_clip": 0.01105607, + "auxiliary_loss_mlp": 0.01084297, + "balance_loss_clip": 1.02279699, + "balance_loss_mlp": 1.00423574, + "epoch": 0.8364095472855168, + "flos": 24785329230720.0, + "grad_norm": 2.6259298779033546, + "language_loss": 0.79958457, + "learning_rate": 2.740833443892874e-07, + "loss": 0.82148361, + "num_input_tokens_seen": 150285455, + "step": 6956, + "time_per_iteration": 2.8262689113616943 + }, + { + "auxiliary_loss_clip": 0.01116914, + "auxiliary_loss_mlp": 0.01083859, + "balance_loss_clip": 1.02517092, + "balance_loss_mlp": 1.00375032, + "epoch": 0.8365297901761558, + "flos": 22743412784640.0, + "grad_norm": 1.8777111533297723, + "language_loss": 0.7969178, + "learning_rate": 2.7368988033404327e-07, + "loss": 0.8189255, + "num_input_tokens_seen": 150302970, + "step": 6957, + "time_per_iteration": 3.677581310272217 + }, + { + "auxiliary_loss_clip": 0.01106775, + "auxiliary_loss_mlp": 0.01083262, + "balance_loss_clip": 1.02400649, + "balance_loss_mlp": 1.0032959, + "epoch": 0.836650033066795, + "flos": 28396003242240.0, + "grad_norm": 1.6166196125143826, + "language_loss": 0.84559333, + "learning_rate": 2.732966781586712e-07, + "loss": 0.86749375, + "num_input_tokens_seen": 150322715, + "step": 6958, + "time_per_iteration": 2.720675468444824 + }, + { + "auxiliary_loss_clip": 0.01126118, + "auxiliary_loss_mlp": 0.01083837, + "balance_loss_clip": 1.02530098, + "balance_loss_mlp": 1.00372827, + "epoch": 0.836770275957434, + "flos": 22236857233920.0, + "grad_norm": 1.6563057635958685, + "language_loss": 0.66419208, + "learning_rate": 2.729037379228205e-07, + "loss": 0.68629164, + "num_input_tokens_seen": 150342900, + "step": 6959, + "time_per_iteration": 2.725029945373535 + }, + { + "auxiliary_loss_clip": 0.01115109, + "auxiliary_loss_mlp": 0.01083529, + "balance_loss_clip": 1.02401447, + "balance_loss_mlp": 1.00342047, + "epoch": 0.8368905188480731, + "flos": 22491930689280.0, + "grad_norm": 1.541182741317729, + "language_loss": 0.80591524, + "learning_rate": 2.725110596860998e-07, + "loss": 0.8279016, + "num_input_tokens_seen": 150363580, + "step": 6960, + "time_per_iteration": 3.757946014404297 + }, + { + "auxiliary_loss_clip": 0.01098922, + "auxiliary_loss_mlp": 0.01084412, + "balance_loss_clip": 1.02468216, + "balance_loss_mlp": 1.00435102, + "epoch": 0.8370107617387123, + "flos": 13370405287680.0, + "grad_norm": 1.8630098979369283, + "language_loss": 0.6997509, + "learning_rate": 2.7211864350807776e-07, + "loss": 0.72158426, + "num_input_tokens_seen": 150381780, + "step": 6961, + "time_per_iteration": 2.821958065032959 + }, + { + "auxiliary_loss_clip": 0.01135001, + "auxiliary_loss_mlp": 0.0108449, + "balance_loss_clip": 1.02585387, + "balance_loss_mlp": 1.00433326, + "epoch": 0.8371310046293513, + "flos": 25261289372160.0, + "grad_norm": 1.7055541025615522, + "language_loss": 0.73479414, + "learning_rate": 2.717264894482836e-07, + "loss": 0.75698906, + "num_input_tokens_seen": 150402120, + "step": 6962, + "time_per_iteration": 2.7382946014404297 + }, + { + "auxiliary_loss_clip": 0.01124594, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_clip": 1.02450526, + "balance_loss_mlp": 1.00436831, + "epoch": 0.8372512475199904, + "flos": 19792705311360.0, + "grad_norm": 2.4188612231770743, + "language_loss": 0.80644643, + "learning_rate": 2.7133459756620646e-07, + "loss": 0.82853758, + "num_input_tokens_seen": 150419315, + "step": 6963, + "time_per_iteration": 2.670165538787842 + }, + { + "auxiliary_loss_clip": 0.01126866, + "auxiliary_loss_mlp": 0.01083977, + "balance_loss_clip": 1.02609324, + "balance_loss_mlp": 1.00391579, + "epoch": 0.8373714904106295, + "flos": 19391224020480.0, + "grad_norm": 1.6893704561557505, + "language_loss": 0.73626256, + "learning_rate": 2.7094296792129733e-07, + "loss": 0.758371, + "num_input_tokens_seen": 150438915, + "step": 6964, + "time_per_iteration": 2.7165040969848633 + }, + { + "auxiliary_loss_clip": 0.01126414, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.02599037, + "balance_loss_mlp": 1.00348735, + "epoch": 0.8374917333012686, + "flos": 14975935401600.0, + "grad_norm": 1.7435255831449556, + "language_loss": 0.75425637, + "learning_rate": 2.7055160057296424e-07, + "loss": 0.7763555, + "num_input_tokens_seen": 150456155, + "step": 6965, + "time_per_iteration": 2.602750062942505 + }, + { + "auxiliary_loss_clip": 0.01108335, + "auxiliary_loss_mlp": 0.01084669, + "balance_loss_clip": 1.02463388, + "balance_loss_mlp": 1.00446451, + "epoch": 0.8376119761919076, + "flos": 30331839847680.0, + "grad_norm": 1.512901582237152, + "language_loss": 0.7198779, + "learning_rate": 2.7016049558057896e-07, + "loss": 0.74180794, + "num_input_tokens_seen": 150478115, + "step": 6966, + "time_per_iteration": 2.880807399749756 + }, + { + "auxiliary_loss_clip": 0.01125017, + "auxiliary_loss_mlp": 0.0108297, + "balance_loss_clip": 1.02522373, + "balance_loss_mlp": 1.00281382, + "epoch": 0.8377322190825467, + "flos": 29423336129280.0, + "grad_norm": 1.6969865748320463, + "language_loss": 0.7089628, + "learning_rate": 2.6976965300347074e-07, + "loss": 0.73104274, + "num_input_tokens_seen": 150500725, + "step": 6967, + "time_per_iteration": 2.7397470474243164 + }, + { + "auxiliary_loss_clip": 0.01115118, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_clip": 1.02333713, + "balance_loss_mlp": 1.00389302, + "epoch": 0.8378524619731859, + "flos": 26687086807680.0, + "grad_norm": 2.2688470712868356, + "language_loss": 0.69085705, + "learning_rate": 2.693790729009309e-07, + "loss": 0.71284825, + "num_input_tokens_seen": 150522335, + "step": 6968, + "time_per_iteration": 2.8046202659606934 + }, + { + "auxiliary_loss_clip": 0.01115601, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_clip": 1.02412701, + "balance_loss_mlp": 1.00302434, + "epoch": 0.8379727048638249, + "flos": 20703866636160.0, + "grad_norm": 1.7994119063926046, + "language_loss": 0.88295257, + "learning_rate": 2.6898875533220946e-07, + "loss": 0.90494037, + "num_input_tokens_seen": 150541640, + "step": 6969, + "time_per_iteration": 2.7160356044769287 + }, + { + "auxiliary_loss_clip": 0.01135166, + "auxiliary_loss_mlp": 0.01083312, + "balance_loss_clip": 1.02644444, + "balance_loss_mlp": 1.00325072, + "epoch": 0.838092947754464, + "flos": 20084084438400.0, + "grad_norm": 1.7679255852245663, + "language_loss": 0.81981063, + "learning_rate": 2.685987003565171e-07, + "loss": 0.84199548, + "num_input_tokens_seen": 150559680, + "step": 6970, + "time_per_iteration": 2.6369924545288086 + }, + { + "auxiliary_loss_clip": 0.01098941, + "auxiliary_loss_mlp": 0.01083889, + "balance_loss_clip": 1.01948977, + "balance_loss_mlp": 1.00373292, + "epoch": 0.8382131906451031, + "flos": 18113270964480.0, + "grad_norm": 2.386479952180344, + "language_loss": 0.74873096, + "learning_rate": 2.6820890803302566e-07, + "loss": 0.77055925, + "num_input_tokens_seen": 150575205, + "step": 6971, + "time_per_iteration": 2.719763994216919 + }, + { + "auxiliary_loss_clip": 0.01095755, + "auxiliary_loss_mlp": 0.01084097, + "balance_loss_clip": 1.02260351, + "balance_loss_mlp": 1.00398862, + "epoch": 0.8383334335357422, + "flos": 17092653920640.0, + "grad_norm": 2.2984751213070314, + "language_loss": 0.82342625, + "learning_rate": 2.6781937842086557e-07, + "loss": 0.84522474, + "num_input_tokens_seen": 150593995, + "step": 6972, + "time_per_iteration": 2.666154384613037 + }, + { + "auxiliary_loss_clip": 0.01126102, + "auxiliary_loss_mlp": 0.01083913, + "balance_loss_clip": 1.02498579, + "balance_loss_mlp": 1.00380468, + "epoch": 0.8384536764263812, + "flos": 20704728562560.0, + "grad_norm": 1.7407526959648467, + "language_loss": 0.67718107, + "learning_rate": 2.6743011157912933e-07, + "loss": 0.69928122, + "num_input_tokens_seen": 150613715, + "step": 6973, + "time_per_iteration": 2.6999175548553467 + }, + { + "auxiliary_loss_clip": 0.01101099, + "auxiliary_loss_mlp": 0.01084588, + "balance_loss_clip": 1.02492857, + "balance_loss_mlp": 1.00438392, + "epoch": 0.8385739193170204, + "flos": 28986842056320.0, + "grad_norm": 2.0162580900987632, + "language_loss": 0.65231609, + "learning_rate": 2.6704110756686725e-07, + "loss": 0.674173, + "num_input_tokens_seen": 150634540, + "step": 6974, + "time_per_iteration": 2.8320705890655518 + }, + { + "auxiliary_loss_clip": 0.0111872, + "auxiliary_loss_mlp": 0.00872904, + "balance_loss_clip": 1.02527809, + "balance_loss_mlp": 1.00007248, + "epoch": 0.8386941622076595, + "flos": 23438068882560.0, + "grad_norm": 2.1833603588273687, + "language_loss": 0.83845633, + "learning_rate": 2.6665236644309085e-07, + "loss": 0.85837257, + "num_input_tokens_seen": 150654850, + "step": 6975, + "time_per_iteration": 2.785414934158325 + }, + { + "auxiliary_loss_clip": 0.01127674, + "auxiliary_loss_mlp": 0.01084148, + "balance_loss_clip": 1.0265615, + "balance_loss_mlp": 1.00399208, + "epoch": 0.8388144050982985, + "flos": 23002724044800.0, + "grad_norm": 1.8369390761001039, + "language_loss": 0.79597938, + "learning_rate": 2.662638882667727e-07, + "loss": 0.81809759, + "num_input_tokens_seen": 150673790, + "step": 6976, + "time_per_iteration": 2.6869075298309326 + }, + { + "auxiliary_loss_clip": 0.01135822, + "auxiliary_loss_mlp": 0.01084831, + "balance_loss_clip": 1.02611864, + "balance_loss_mlp": 1.00462723, + "epoch": 0.8389346479889377, + "flos": 24280353878400.0, + "grad_norm": 1.8173928906307115, + "language_loss": 0.73397946, + "learning_rate": 2.658756730968443e-07, + "loss": 0.75618595, + "num_input_tokens_seen": 150692255, + "step": 6977, + "time_per_iteration": 2.675663709640503 + }, + { + "auxiliary_loss_clip": 0.01116048, + "auxiliary_loss_mlp": 0.01085069, + "balance_loss_clip": 1.02456832, + "balance_loss_mlp": 1.00491297, + "epoch": 0.8390548908795767, + "flos": 21215019127680.0, + "grad_norm": 1.9665104163339846, + "language_loss": 0.88194835, + "learning_rate": 2.654877209921975e-07, + "loss": 0.90395951, + "num_input_tokens_seen": 150709790, + "step": 6978, + "time_per_iteration": 2.704040765762329 + }, + { + "auxiliary_loss_clip": 0.01101248, + "auxiliary_loss_mlp": 0.010849, + "balance_loss_clip": 1.02412224, + "balance_loss_mlp": 1.00474417, + "epoch": 0.8391751337702158, + "flos": 35627299332480.0, + "grad_norm": 2.0092133195046116, + "language_loss": 0.62471563, + "learning_rate": 2.651000320116843e-07, + "loss": 0.64657712, + "num_input_tokens_seen": 150730675, + "step": 6979, + "time_per_iteration": 3.8335306644439697 + }, + { + "auxiliary_loss_clip": 0.01107425, + "auxiliary_loss_mlp": 0.00872965, + "balance_loss_clip": 1.02384281, + "balance_loss_mlp": 1.00009608, + "epoch": 0.839295376660855, + "flos": 21325229032320.0, + "grad_norm": 1.7248365164747563, + "language_loss": 0.7580415, + "learning_rate": 2.647126062141163e-07, + "loss": 0.77784538, + "num_input_tokens_seen": 150749750, + "step": 6980, + "time_per_iteration": 2.790391445159912 + }, + { + "auxiliary_loss_clip": 0.01118276, + "auxiliary_loss_mlp": 0.01084332, + "balance_loss_clip": 1.02543962, + "balance_loss_mlp": 1.00422287, + "epoch": 0.839415619551494, + "flos": 18442535961600.0, + "grad_norm": 1.7065394376378913, + "language_loss": 0.83895099, + "learning_rate": 2.643254436582669e-07, + "loss": 0.86097711, + "num_input_tokens_seen": 150769240, + "step": 6981, + "time_per_iteration": 2.671243667602539 + }, + { + "auxiliary_loss_clip": 0.01090203, + "auxiliary_loss_mlp": 0.01083014, + "balance_loss_clip": 1.0213232, + "balance_loss_mlp": 1.00285816, + "epoch": 0.8395358624421331, + "flos": 23221958705280.0, + "grad_norm": 2.1561349479128165, + "language_loss": 0.823443, + "learning_rate": 2.6393854440286743e-07, + "loss": 0.84517515, + "num_input_tokens_seen": 150788410, + "step": 6982, + "time_per_iteration": 3.7858355045318604 + }, + { + "auxiliary_loss_clip": 0.01136592, + "auxiliary_loss_mlp": 0.01083975, + "balance_loss_clip": 1.02755272, + "balance_loss_mlp": 1.00391364, + "epoch": 0.8396561053327722, + "flos": 24381657210240.0, + "grad_norm": 2.1232304820223216, + "language_loss": 0.70729423, + "learning_rate": 2.6355190850661045e-07, + "loss": 0.72949988, + "num_input_tokens_seen": 150805245, + "step": 6983, + "time_per_iteration": 2.645803928375244 + }, + { + "auxiliary_loss_clip": 0.01118812, + "auxiliary_loss_mlp": 0.01083141, + "balance_loss_clip": 1.02666557, + "balance_loss_mlp": 1.00307977, + "epoch": 0.8397763482234113, + "flos": 22237755073920.0, + "grad_norm": 1.516124782120157, + "language_loss": 0.86334062, + "learning_rate": 2.631655360281486e-07, + "loss": 0.88536018, + "num_input_tokens_seen": 150824920, + "step": 6984, + "time_per_iteration": 2.707502841949463 + }, + { + "auxiliary_loss_clip": 0.01110581, + "auxiliary_loss_mlp": 0.00872961, + "balance_loss_clip": 1.02569366, + "balance_loss_mlp": 1.00007248, + "epoch": 0.8398965911140504, + "flos": 22163743100160.0, + "grad_norm": 1.8111978034967273, + "language_loss": 0.65584826, + "learning_rate": 2.6277942702609323e-07, + "loss": 0.67568374, + "num_input_tokens_seen": 150844400, + "step": 6985, + "time_per_iteration": 2.718569755554199 + }, + { + "auxiliary_loss_clip": 0.01106801, + "auxiliary_loss_mlp": 0.01084213, + "balance_loss_clip": 1.02406037, + "balance_loss_mlp": 1.00405669, + "epoch": 0.8400168340046895, + "flos": 21542775753600.0, + "grad_norm": 1.8394204828435445, + "language_loss": 0.87201428, + "learning_rate": 2.623935815590186e-07, + "loss": 0.89392436, + "num_input_tokens_seen": 150862780, + "step": 6986, + "time_per_iteration": 3.6393113136291504 + }, + { + "auxiliary_loss_clip": 0.01116828, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_clip": 1.02469027, + "balance_loss_mlp": 1.00402188, + "epoch": 0.8401370768953286, + "flos": 22491966602880.0, + "grad_norm": 3.650289108662565, + "language_loss": 0.80693042, + "learning_rate": 2.6200799968545516e-07, + "loss": 0.82894003, + "num_input_tokens_seen": 150883075, + "step": 6987, + "time_per_iteration": 2.79217791557312 + }, + { + "auxiliary_loss_clip": 0.01092603, + "auxiliary_loss_mlp": 0.01079039, + "balance_loss_clip": 1.0227387, + "balance_loss_mlp": 1.00007486, + "epoch": 0.8402573197859676, + "flos": 59238890818560.0, + "grad_norm": 0.7853838815942537, + "language_loss": 0.56466591, + "learning_rate": 2.616226814638969e-07, + "loss": 0.58638239, + "num_input_tokens_seen": 150948180, + "step": 6988, + "time_per_iteration": 3.3276917934417725 + }, + { + "auxiliary_loss_clip": 0.01100155, + "auxiliary_loss_mlp": 0.01085016, + "balance_loss_clip": 1.02501094, + "balance_loss_mlp": 1.00495493, + "epoch": 0.8403775626766068, + "flos": 22674608282880.0, + "grad_norm": 1.8940941259522026, + "language_loss": 0.77088678, + "learning_rate": 2.612376269527954e-07, + "loss": 0.79273856, + "num_input_tokens_seen": 150967885, + "step": 6989, + "time_per_iteration": 2.7017860412597656 + }, + { + "auxiliary_loss_clip": 0.01114789, + "auxiliary_loss_mlp": 0.01084582, + "balance_loss_clip": 1.02443171, + "balance_loss_mlp": 1.00442553, + "epoch": 0.8404978055672458, + "flos": 19609704495360.0, + "grad_norm": 1.632960050391164, + "language_loss": 0.6760478, + "learning_rate": 2.608528362105635e-07, + "loss": 0.69804156, + "num_input_tokens_seen": 150987255, + "step": 6990, + "time_per_iteration": 2.720673084259033 + }, + { + "auxiliary_loss_clip": 0.01107704, + "auxiliary_loss_mlp": 0.01085771, + "balance_loss_clip": 1.02360642, + "balance_loss_mlp": 1.00561476, + "epoch": 0.8406180484578849, + "flos": 27526929678720.0, + "grad_norm": 1.6531444052084514, + "language_loss": 0.72687054, + "learning_rate": 2.6046830929557374e-07, + "loss": 0.74880528, + "num_input_tokens_seen": 151006905, + "step": 6991, + "time_per_iteration": 2.8490493297576904 + }, + { + "auxiliary_loss_clip": 0.01107164, + "auxiliary_loss_mlp": 0.01086397, + "balance_loss_clip": 1.02367556, + "balance_loss_mlp": 1.00614524, + "epoch": 0.8407382913485241, + "flos": 22127473342080.0, + "grad_norm": 2.0421563790796906, + "language_loss": 0.85002065, + "learning_rate": 2.6008404626615776e-07, + "loss": 0.87195623, + "num_input_tokens_seen": 151025405, + "step": 6992, + "time_per_iteration": 2.753281354904175 + }, + { + "auxiliary_loss_clip": 0.01129066, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_clip": 1.02784991, + "balance_loss_mlp": 1.00425887, + "epoch": 0.8408585342391631, + "flos": 13918473982080.0, + "grad_norm": 2.3800395034731623, + "language_loss": 0.73775637, + "learning_rate": 2.597000471806092e-07, + "loss": 0.7598902, + "num_input_tokens_seen": 151041970, + "step": 6993, + "time_per_iteration": 2.617889642715454 + }, + { + "auxiliary_loss_clip": 0.01109653, + "auxiliary_loss_mlp": 0.01084469, + "balance_loss_clip": 1.02448595, + "balance_loss_mlp": 1.00426531, + "epoch": 0.8409787771298022, + "flos": 20187865808640.0, + "grad_norm": 2.086680978301909, + "language_loss": 0.73096299, + "learning_rate": 2.593163120971793e-07, + "loss": 0.75290424, + "num_input_tokens_seen": 151060835, + "step": 6994, + "time_per_iteration": 2.7280774116516113 + }, + { + "auxiliary_loss_clip": 0.01100602, + "auxiliary_loss_mlp": 0.01083332, + "balance_loss_clip": 1.02453351, + "balance_loss_mlp": 1.0032233, + "epoch": 0.8410990200204413, + "flos": 23142523777920.0, + "grad_norm": 1.9015588687113025, + "language_loss": 0.69066095, + "learning_rate": 2.5893284107408165e-07, + "loss": 0.71250027, + "num_input_tokens_seen": 151078205, + "step": 6995, + "time_per_iteration": 2.7769582271575928 + }, + { + "auxiliary_loss_clip": 0.01092476, + "auxiliary_loss_mlp": 0.01085082, + "balance_loss_clip": 1.02042234, + "balance_loss_mlp": 1.00492573, + "epoch": 0.8412192629110804, + "flos": 24027219757440.0, + "grad_norm": 1.8001342159184919, + "language_loss": 0.77899528, + "learning_rate": 2.5854963416948726e-07, + "loss": 0.80077088, + "num_input_tokens_seen": 151100470, + "step": 6996, + "time_per_iteration": 2.8592488765716553 + }, + { + "auxiliary_loss_clip": 0.01085356, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_clip": 1.02483082, + "balance_loss_mlp": 1.00432754, + "epoch": 0.8413395058017195, + "flos": 25591703604480.0, + "grad_norm": 1.7231124720233115, + "language_loss": 0.69390845, + "learning_rate": 2.5816669144152816e-07, + "loss": 0.71560681, + "num_input_tokens_seen": 151121650, + "step": 6997, + "time_per_iteration": 2.911762237548828 + }, + { + "auxiliary_loss_clip": 0.01113142, + "auxiliary_loss_mlp": 0.01079193, + "balance_loss_clip": 1.01797962, + "balance_loss_mlp": 1.00022888, + "epoch": 0.8414597486923585, + "flos": 63635396624640.0, + "grad_norm": 0.8478001392780845, + "language_loss": 0.66337252, + "learning_rate": 2.5778401294829777e-07, + "loss": 0.68529588, + "num_input_tokens_seen": 151180390, + "step": 6998, + "time_per_iteration": 3.2644882202148438 + }, + { + "auxiliary_loss_clip": 0.01126254, + "auxiliary_loss_mlp": 0.00872837, + "balance_loss_clip": 1.02591705, + "balance_loss_mlp": 1.00008821, + "epoch": 0.8415799915829977, + "flos": 19098731571840.0, + "grad_norm": 1.8755984154551257, + "language_loss": 0.648996, + "learning_rate": 2.574015987478473e-07, + "loss": 0.66898692, + "num_input_tokens_seen": 151198520, + "step": 6999, + "time_per_iteration": 2.6825003623962402 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.01084354, + "balance_loss_clip": 1.02407587, + "balance_loss_mlp": 1.00415027, + "epoch": 0.8417002344736367, + "flos": 19821612781440.0, + "grad_norm": 1.9428848439171094, + "language_loss": 0.86686432, + "learning_rate": 2.570194488981887e-07, + "loss": 0.88887101, + "num_input_tokens_seen": 151215065, + "step": 7000, + "time_per_iteration": 2.666337013244629 + }, + { + "auxiliary_loss_clip": 0.01113285, + "auxiliary_loss_mlp": 0.01079, + "balance_loss_clip": 1.01810098, + "balance_loss_mlp": 1.00003576, + "epoch": 0.8418204773642758, + "flos": 62161516834560.0, + "grad_norm": 0.8401507251260361, + "language_loss": 0.60360539, + "learning_rate": 2.566375634572939e-07, + "loss": 0.62552822, + "num_input_tokens_seen": 151275705, + "step": 7001, + "time_per_iteration": 3.1781392097473145 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01083916, + "balance_loss_clip": 1.02440381, + "balance_loss_mlp": 1.00371218, + "epoch": 0.841940720254915, + "flos": 17092905315840.0, + "grad_norm": 1.7342756686824563, + "language_loss": 0.7652719, + "learning_rate": 2.562559424830943e-07, + "loss": 0.7871955, + "num_input_tokens_seen": 151293665, + "step": 7002, + "time_per_iteration": 2.722668170928955 + }, + { + "auxiliary_loss_clip": 0.0111902, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.02572, + "balance_loss_mlp": 1.00432777, + "epoch": 0.842060963145554, + "flos": 16283586026880.0, + "grad_norm": 1.8513399196668503, + "language_loss": 0.70134723, + "learning_rate": 2.5587458603348256e-07, + "loss": 0.72338134, + "num_input_tokens_seen": 151310955, + "step": 7003, + "time_per_iteration": 2.7502048015594482 + }, + { + "auxiliary_loss_clip": 0.01100508, + "auxiliary_loss_mlp": 0.01083801, + "balance_loss_clip": 1.02294981, + "balance_loss_mlp": 1.00373995, + "epoch": 0.8421812060361931, + "flos": 21908238681600.0, + "grad_norm": 1.8639405395356654, + "language_loss": 0.84121346, + "learning_rate": 2.554934941663085e-07, + "loss": 0.86305654, + "num_input_tokens_seen": 151328490, + "step": 7004, + "time_per_iteration": 3.641467571258545 + }, + { + "auxiliary_loss_clip": 0.01100212, + "auxiliary_loss_mlp": 0.01084553, + "balance_loss_clip": 1.02337217, + "balance_loss_mlp": 1.00439668, + "epoch": 0.8423014489268322, + "flos": 27777693502080.0, + "grad_norm": 2.2526799924143934, + "language_loss": 0.73322779, + "learning_rate": 2.5511266693938484e-07, + "loss": 0.75507545, + "num_input_tokens_seen": 151346950, + "step": 7005, + "time_per_iteration": 3.6929519176483154 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01083939, + "balance_loss_clip": 1.02428067, + "balance_loss_mlp": 1.00373483, + "epoch": 0.8424216918174713, + "flos": 25117610970240.0, + "grad_norm": 1.5075973440621038, + "language_loss": 0.77713513, + "learning_rate": 2.547321044104822e-07, + "loss": 0.79905969, + "num_input_tokens_seen": 151368445, + "step": 7006, + "time_per_iteration": 2.8008275032043457 + }, + { + "auxiliary_loss_clip": 0.0113629, + "auxiliary_loss_mlp": 0.01084939, + "balance_loss_clip": 1.02657747, + "balance_loss_mlp": 1.00473547, + "epoch": 0.8425419347081103, + "flos": 24748448941440.0, + "grad_norm": 4.964434305503939, + "language_loss": 0.76377451, + "learning_rate": 2.5435180663733113e-07, + "loss": 0.78598678, + "num_input_tokens_seen": 151388745, + "step": 7007, + "time_per_iteration": 3.6218559741973877 + }, + { + "auxiliary_loss_clip": 0.0108521, + "auxiliary_loss_mlp": 0.0108427, + "balance_loss_clip": 1.02477813, + "balance_loss_mlp": 1.00411344, + "epoch": 0.8426621775987495, + "flos": 24820916630400.0, + "grad_norm": 2.5150778563754206, + "language_loss": 0.718436, + "learning_rate": 2.539717736776241e-07, + "loss": 0.74013078, + "num_input_tokens_seen": 151404970, + "step": 7008, + "time_per_iteration": 2.8186240196228027 + }, + { + "auxiliary_loss_clip": 0.01118879, + "auxiliary_loss_mlp": 0.01083908, + "balance_loss_clip": 1.02472925, + "balance_loss_mlp": 1.00384748, + "epoch": 0.8427824204893886, + "flos": 23550074467200.0, + "grad_norm": 1.3505904831161848, + "language_loss": 0.76434642, + "learning_rate": 2.535920055890097e-07, + "loss": 0.78637427, + "num_input_tokens_seen": 151426265, + "step": 7009, + "time_per_iteration": 2.663429021835327 + }, + { + "auxiliary_loss_clip": 0.0110153, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_clip": 1.0247823, + "balance_loss_mlp": 1.00366187, + "epoch": 0.8429026633800276, + "flos": 16143858120960.0, + "grad_norm": 2.1624072208174967, + "language_loss": 0.64266193, + "learning_rate": 2.5321250242910006e-07, + "loss": 0.6645155, + "num_input_tokens_seen": 151444180, + "step": 7010, + "time_per_iteration": 2.7403573989868164 + }, + { + "auxiliary_loss_clip": 0.01136691, + "auxiliary_loss_mlp": 0.01084643, + "balance_loss_clip": 1.02736795, + "balance_loss_mlp": 1.00453436, + "epoch": 0.8430229062706668, + "flos": 22198540400640.0, + "grad_norm": 1.8768063057542772, + "language_loss": 0.86467409, + "learning_rate": 2.5283326425546493e-07, + "loss": 0.88688743, + "num_input_tokens_seen": 151463290, + "step": 7011, + "time_per_iteration": 3.5697741508483887 + }, + { + "auxiliary_loss_clip": 0.01107386, + "auxiliary_loss_mlp": 0.010841, + "balance_loss_clip": 1.02588582, + "balance_loss_mlp": 1.00408721, + "epoch": 0.8431431491613058, + "flos": 35330317683840.0, + "grad_norm": 1.9576128608151386, + "language_loss": 0.69806039, + "learning_rate": 2.5245429112563443e-07, + "loss": 0.71997523, + "num_input_tokens_seen": 151483965, + "step": 7012, + "time_per_iteration": 2.924379348754883 + }, + { + "auxiliary_loss_clip": 0.01125716, + "auxiliary_loss_mlp": 0.01083862, + "balance_loss_clip": 1.02576089, + "balance_loss_mlp": 1.00370574, + "epoch": 0.8432633920519449, + "flos": 25812374808960.0, + "grad_norm": 1.6844099164410002, + "language_loss": 0.82049495, + "learning_rate": 2.5207558309709865e-07, + "loss": 0.84259075, + "num_input_tokens_seen": 151503700, + "step": 7013, + "time_per_iteration": 2.6930060386657715 + }, + { + "auxiliary_loss_clip": 0.01089664, + "auxiliary_loss_mlp": 0.0087297, + "balance_loss_clip": 1.01798558, + "balance_loss_mlp": 1.00140476, + "epoch": 0.8433836349425841, + "flos": 64959531592320.0, + "grad_norm": 0.6557636498668751, + "language_loss": 0.56314588, + "learning_rate": 2.516971402273065e-07, + "loss": 0.58277225, + "num_input_tokens_seen": 151569765, + "step": 7014, + "time_per_iteration": 3.3448870182037354 + }, + { + "auxiliary_loss_clip": 0.01116601, + "auxiliary_loss_mlp": 0.01083706, + "balance_loss_clip": 1.02472758, + "balance_loss_mlp": 1.00355005, + "epoch": 0.8435038778332231, + "flos": 20229989483520.0, + "grad_norm": 1.9261189727099899, + "language_loss": 0.67739856, + "learning_rate": 2.513189625736687e-07, + "loss": 0.69940162, + "num_input_tokens_seen": 151586660, + "step": 7015, + "time_per_iteration": 2.7280778884887695 + }, + { + "auxiliary_loss_clip": 0.01107799, + "auxiliary_loss_mlp": 0.01083885, + "balance_loss_clip": 1.02395856, + "balance_loss_mlp": 1.00372863, + "epoch": 0.8436241207238622, + "flos": 20992229020800.0, + "grad_norm": 30.64005163471333, + "language_loss": 0.71820879, + "learning_rate": 2.509410501935534e-07, + "loss": 0.7401256, + "num_input_tokens_seen": 151602295, + "step": 7016, + "time_per_iteration": 2.813591480255127 + }, + { + "auxiliary_loss_clip": 0.01116958, + "auxiliary_loss_mlp": 0.01084223, + "balance_loss_clip": 1.02521682, + "balance_loss_mlp": 1.00397122, + "epoch": 0.8437443636145013, + "flos": 14682257804160.0, + "grad_norm": 2.6145901248862184, + "language_loss": 0.75040913, + "learning_rate": 2.5056340314429116e-07, + "loss": 0.77242088, + "num_input_tokens_seen": 151619760, + "step": 7017, + "time_per_iteration": 2.760416030883789 + }, + { + "auxiliary_loss_clip": 0.01100117, + "auxiliary_loss_mlp": 0.0108464, + "balance_loss_clip": 1.02339125, + "balance_loss_mlp": 1.0044359, + "epoch": 0.8438646065051404, + "flos": 21608814908160.0, + "grad_norm": 3.6785664818214245, + "language_loss": 0.80127895, + "learning_rate": 2.5018602148316904e-07, + "loss": 0.82312649, + "num_input_tokens_seen": 151635795, + "step": 7018, + "time_per_iteration": 2.8104305267333984 + }, + { + "auxiliary_loss_clip": 0.01135592, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_clip": 1.02672696, + "balance_loss_mlp": 1.00437713, + "epoch": 0.8439848493957794, + "flos": 23289937194240.0, + "grad_norm": 1.6592547087969138, + "language_loss": 0.80238211, + "learning_rate": 2.498089052674359e-07, + "loss": 0.82458293, + "num_input_tokens_seen": 151653770, + "step": 7019, + "time_per_iteration": 2.7091331481933594 + }, + { + "auxiliary_loss_clip": 0.01126842, + "auxiliary_loss_mlp": 0.01083871, + "balance_loss_clip": 1.02678585, + "balance_loss_mlp": 1.00371468, + "epoch": 0.8441050922864186, + "flos": 19719339782400.0, + "grad_norm": 1.7948543035355682, + "language_loss": 0.75182605, + "learning_rate": 2.494320545543007e-07, + "loss": 0.77393317, + "num_input_tokens_seen": 151673340, + "step": 7020, + "time_per_iteration": 2.66807222366333 + }, + { + "auxiliary_loss_clip": 0.01136446, + "auxiliary_loss_mlp": 0.01084423, + "balance_loss_clip": 1.02658975, + "balance_loss_mlp": 1.00426722, + "epoch": 0.8442253351770577, + "flos": 21835268202240.0, + "grad_norm": 1.5967899332050008, + "language_loss": 0.66624749, + "learning_rate": 2.490554694009308e-07, + "loss": 0.68845618, + "num_input_tokens_seen": 151694205, + "step": 7021, + "time_per_iteration": 2.6682252883911133 + }, + { + "auxiliary_loss_clip": 0.01127308, + "auxiliary_loss_mlp": 0.01084419, + "balance_loss_clip": 1.02557504, + "balance_loss_mlp": 1.00431037, + "epoch": 0.8443455780676967, + "flos": 34346365447680.0, + "grad_norm": 1.4996480577587252, + "language_loss": 0.78438079, + "learning_rate": 2.4867914986445426e-07, + "loss": 0.80649805, + "num_input_tokens_seen": 151716595, + "step": 7022, + "time_per_iteration": 2.7769527435302734 + }, + { + "auxiliary_loss_clip": 0.01118371, + "auxiliary_loss_mlp": 0.01084751, + "balance_loss_clip": 1.02532244, + "balance_loss_mlp": 1.00464249, + "epoch": 0.8444658209583359, + "flos": 48214599281280.0, + "grad_norm": 2.1308811477249123, + "language_loss": 0.70739597, + "learning_rate": 2.483030960019581e-07, + "loss": 0.72942722, + "num_input_tokens_seen": 151740525, + "step": 7023, + "time_per_iteration": 3.000978708267212 + }, + { + "auxiliary_loss_clip": 0.01081206, + "auxiliary_loss_mlp": 0.01079151, + "balance_loss_clip": 1.01880193, + "balance_loss_mlp": 1.00018656, + "epoch": 0.8445860638489749, + "flos": 68484773105280.0, + "grad_norm": 0.730386722309085, + "language_loss": 0.55493152, + "learning_rate": 2.479273078704891e-07, + "loss": 0.57653505, + "num_input_tokens_seen": 151793890, + "step": 7024, + "time_per_iteration": 3.2948577404022217 + }, + { + "auxiliary_loss_clip": 0.01077775, + "auxiliary_loss_mlp": 0.01079462, + "balance_loss_clip": 1.02507138, + "balance_loss_mlp": 1.00049782, + "epoch": 0.844706306739614, + "flos": 62833331882880.0, + "grad_norm": 0.7845542126416146, + "language_loss": 0.6474607, + "learning_rate": 2.475517855270552e-07, + "loss": 0.66903305, + "num_input_tokens_seen": 151853970, + "step": 7025, + "time_per_iteration": 3.341977834701538 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01084073, + "balance_loss_clip": 1.02583873, + "balance_loss_mlp": 1.00391626, + "epoch": 0.8448265496302532, + "flos": 14976114969600.0, + "grad_norm": 2.4795020646705033, + "language_loss": 0.72310603, + "learning_rate": 2.4717652902862143e-07, + "loss": 0.74529469, + "num_input_tokens_seen": 151872945, + "step": 7026, + "time_per_iteration": 2.6525697708129883 + }, + { + "auxiliary_loss_clip": 0.01098751, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_clip": 1.02374351, + "balance_loss_mlp": 1.00358248, + "epoch": 0.8449467925208922, + "flos": 23441265192960.0, + "grad_norm": 1.6095664895984296, + "language_loss": 0.81227469, + "learning_rate": 2.4680153843211495e-07, + "loss": 0.83409917, + "num_input_tokens_seen": 151892875, + "step": 7027, + "time_per_iteration": 2.691948652267456 + }, + { + "auxiliary_loss_clip": 0.01115507, + "auxiliary_loss_mlp": 0.01084625, + "balance_loss_clip": 1.02494979, + "balance_loss_mlp": 1.0045166, + "epoch": 0.8450670354115313, + "flos": 22748045639040.0, + "grad_norm": 1.6448393092732303, + "language_loss": 0.72351885, + "learning_rate": 2.464268137944212e-07, + "loss": 0.74552017, + "num_input_tokens_seen": 151914170, + "step": 7028, + "time_per_iteration": 2.7999160289764404 + }, + { + "auxiliary_loss_clip": 0.01094056, + "auxiliary_loss_mlp": 0.01084085, + "balance_loss_clip": 1.02076638, + "balance_loss_mlp": 1.00388074, + "epoch": 0.8451872783021703, + "flos": 29825571605760.0, + "grad_norm": 1.8198712908843364, + "language_loss": 0.78351164, + "learning_rate": 2.46052355172385e-07, + "loss": 0.80529302, + "num_input_tokens_seen": 151932210, + "step": 7029, + "time_per_iteration": 2.852504014968872 + }, + { + "auxiliary_loss_clip": 0.01134652, + "auxiliary_loss_mlp": 0.01084074, + "balance_loss_clip": 1.02538705, + "balance_loss_mlp": 1.00391722, + "epoch": 0.8453075211928095, + "flos": 21870029589120.0, + "grad_norm": 1.8017801915958485, + "language_loss": 0.74459016, + "learning_rate": 2.456781626228128e-07, + "loss": 0.7667774, + "num_input_tokens_seen": 151951715, + "step": 7030, + "time_per_iteration": 3.5652832984924316 + }, + { + "auxiliary_loss_clip": 0.01079576, + "auxiliary_loss_mlp": 0.00873, + "balance_loss_clip": 1.01759243, + "balance_loss_mlp": 1.00138605, + "epoch": 0.8454277640834486, + "flos": 58751869288320.0, + "grad_norm": 0.916066695891638, + "language_loss": 0.66346216, + "learning_rate": 2.453042362024675e-07, + "loss": 0.68298793, + "num_input_tokens_seen": 152004960, + "step": 7031, + "time_per_iteration": 3.39558482170105 + }, + { + "auxiliary_loss_clip": 0.01134373, + "auxiliary_loss_mlp": 0.01084474, + "balance_loss_clip": 1.02502012, + "balance_loss_mlp": 1.00450826, + "epoch": 0.8455480069740876, + "flos": 27090076469760.0, + "grad_norm": 1.5297907421957835, + "language_loss": 0.73171371, + "learning_rate": 2.449305759680751e-07, + "loss": 0.75390214, + "num_input_tokens_seen": 152026285, + "step": 7032, + "time_per_iteration": 2.6403937339782715 + }, + { + "auxiliary_loss_clip": 0.01103179, + "auxiliary_loss_mlp": 0.01083643, + "balance_loss_clip": 1.02244914, + "balance_loss_mlp": 1.00358224, + "epoch": 0.8456682498647268, + "flos": 27198670262400.0, + "grad_norm": 2.0033746404041777, + "language_loss": 0.75166488, + "learning_rate": 2.445571819763188e-07, + "loss": 0.77353311, + "num_input_tokens_seen": 152048585, + "step": 7033, + "time_per_iteration": 3.785696268081665 + }, + { + "auxiliary_loss_clip": 0.0113522, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_clip": 1.02621877, + "balance_loss_mlp": 1.00426412, + "epoch": 0.8457884927553658, + "flos": 20631901737600.0, + "grad_norm": 1.6052388279825796, + "language_loss": 0.58171278, + "learning_rate": 2.4418405428384227e-07, + "loss": 0.60390824, + "num_input_tokens_seen": 152068795, + "step": 7034, + "time_per_iteration": 2.698169469833374 + }, + { + "auxiliary_loss_clip": 0.01133916, + "auxiliary_loss_mlp": 0.00872901, + "balance_loss_clip": 1.0250386, + "balance_loss_mlp": 1.00007164, + "epoch": 0.8459087356460049, + "flos": 15299023259520.0, + "grad_norm": 1.7033299835984557, + "language_loss": 0.71277314, + "learning_rate": 2.4381119294724864e-07, + "loss": 0.73284131, + "num_input_tokens_seen": 152086240, + "step": 7035, + "time_per_iteration": 2.6213226318359375 + }, + { + "auxiliary_loss_clip": 0.01134366, + "auxiliary_loss_mlp": 0.01084516, + "balance_loss_clip": 1.02514029, + "balance_loss_mlp": 1.00440788, + "epoch": 0.846028978536644, + "flos": 18843155326080.0, + "grad_norm": 2.015167638747839, + "language_loss": 0.53842425, + "learning_rate": 2.434385980231004e-07, + "loss": 0.56061304, + "num_input_tokens_seen": 152105080, + "step": 7036, + "time_per_iteration": 3.5435733795166016 + }, + { + "auxiliary_loss_clip": 0.01124893, + "auxiliary_loss_mlp": 0.0108487, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 1.00476146, + "epoch": 0.8461492214272831, + "flos": 52661740285440.0, + "grad_norm": 4.900504425471166, + "language_loss": 0.65573001, + "learning_rate": 2.4306626956792043e-07, + "loss": 0.67782766, + "num_input_tokens_seen": 152130025, + "step": 7037, + "time_per_iteration": 2.90552020072937 + }, + { + "auxiliary_loss_clip": 0.01123323, + "auxiliary_loss_mlp": 0.01083453, + "balance_loss_clip": 1.02338159, + "balance_loss_mlp": 1.00329673, + "epoch": 0.8462694643179222, + "flos": 18588405093120.0, + "grad_norm": 1.6755385116646564, + "language_loss": 0.75846541, + "learning_rate": 2.4269420763819017e-07, + "loss": 0.78053319, + "num_input_tokens_seen": 152148070, + "step": 7038, + "time_per_iteration": 2.69355845451355 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01084099, + "balance_loss_clip": 1.02527118, + "balance_loss_mlp": 1.0040381, + "epoch": 0.8463897072085613, + "flos": 24387080163840.0, + "grad_norm": 2.5932658710276333, + "language_loss": 0.83184916, + "learning_rate": 2.4232241229035223e-07, + "loss": 0.85393959, + "num_input_tokens_seen": 152165825, + "step": 7039, + "time_per_iteration": 2.712982177734375 + }, + { + "auxiliary_loss_clip": 0.01105266, + "auxiliary_loss_mlp": 0.01078895, + "balance_loss_clip": 1.0180254, + "balance_loss_mlp": 0.9999308, + "epoch": 0.8465099500992004, + "flos": 68702140258560.0, + "grad_norm": 0.7538885026929847, + "language_loss": 0.5679338, + "learning_rate": 2.419508835808064e-07, + "loss": 0.58977538, + "num_input_tokens_seen": 152222380, + "step": 7040, + "time_per_iteration": 3.1826012134552 + }, + { + "auxiliary_loss_clip": 0.01114549, + "auxiliary_loss_mlp": 0.01084814, + "balance_loss_clip": 1.02382755, + "balance_loss_mlp": 1.00470495, + "epoch": 0.8466301929898394, + "flos": 13735724561280.0, + "grad_norm": 2.0310499220919236, + "language_loss": 0.63053149, + "learning_rate": 2.415796215659134e-07, + "loss": 0.65252507, + "num_input_tokens_seen": 152239085, + "step": 7041, + "time_per_iteration": 2.6920413970947266 + }, + { + "auxiliary_loss_clip": 0.011094, + "auxiliary_loss_mlp": 0.01085255, + "balance_loss_clip": 1.02376962, + "balance_loss_mlp": 1.00514615, + "epoch": 0.8467504358804786, + "flos": 19241260738560.0, + "grad_norm": 2.000924785276873, + "language_loss": 0.77391696, + "learning_rate": 2.412086263019939e-07, + "loss": 0.79586351, + "num_input_tokens_seen": 152257110, + "step": 7042, + "time_per_iteration": 2.734179735183716 + }, + { + "auxiliary_loss_clip": 0.01135675, + "auxiliary_loss_mlp": 0.01083965, + "balance_loss_clip": 1.02719259, + "balance_loss_mlp": 1.00395155, + "epoch": 0.8468706787711177, + "flos": 21324115710720.0, + "grad_norm": 3.199765697732583, + "language_loss": 0.79933786, + "learning_rate": 2.408378978453276e-07, + "loss": 0.82153428, + "num_input_tokens_seen": 152277230, + "step": 7043, + "time_per_iteration": 2.611926317214966 + }, + { + "auxiliary_loss_clip": 0.01105188, + "auxiliary_loss_mlp": 0.01078838, + "balance_loss_clip": 1.01806962, + "balance_loss_mlp": 0.99987376, + "epoch": 0.8469909216617567, + "flos": 64877439058560.0, + "grad_norm": 0.813587162761252, + "language_loss": 0.63980639, + "learning_rate": 2.404674362521533e-07, + "loss": 0.66164666, + "num_input_tokens_seen": 152335725, + "step": 7044, + "time_per_iteration": 3.119569778442383 + }, + { + "auxiliary_loss_clip": 0.01125168, + "auxiliary_loss_mlp": 0.01084929, + "balance_loss_clip": 1.02579832, + "balance_loss_mlp": 1.00491548, + "epoch": 0.8471111645523959, + "flos": 19280583152640.0, + "grad_norm": 2.179595926197895, + "language_loss": 0.74579036, + "learning_rate": 2.4009724157866997e-07, + "loss": 0.76789135, + "num_input_tokens_seen": 152352785, + "step": 7045, + "time_per_iteration": 2.71370267868042 + }, + { + "auxiliary_loss_clip": 0.01135968, + "auxiliary_loss_mlp": 0.0108418, + "balance_loss_clip": 1.02664065, + "balance_loss_mlp": 1.004071, + "epoch": 0.8472314074430349, + "flos": 22015826893440.0, + "grad_norm": 1.8400700350175831, + "language_loss": 0.76645637, + "learning_rate": 2.3972731388103564e-07, + "loss": 0.78865784, + "num_input_tokens_seen": 152371265, + "step": 7046, + "time_per_iteration": 2.6567485332489014 + }, + { + "auxiliary_loss_clip": 0.01067769, + "auxiliary_loss_mlp": 0.01079028, + "balance_loss_clip": 1.015643, + "balance_loss_mlp": 1.00006342, + "epoch": 0.847351650333674, + "flos": 57882580243200.0, + "grad_norm": 0.803339694734297, + "language_loss": 0.62430894, + "learning_rate": 2.393576532153687e-07, + "loss": 0.64577699, + "num_input_tokens_seen": 152435050, + "step": 7047, + "time_per_iteration": 3.4396004676818848 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01078795, + "balance_loss_clip": 1.01692986, + "balance_loss_mlp": 0.99983042, + "epoch": 0.8474718932243132, + "flos": 41284238313600.0, + "grad_norm": 0.9312143027043024, + "language_loss": 0.57823056, + "learning_rate": 2.389882596377453e-07, + "loss": 0.60005307, + "num_input_tokens_seen": 152489315, + "step": 7048, + "time_per_iteration": 3.2113823890686035 + }, + { + "auxiliary_loss_clip": 0.01133583, + "auxiliary_loss_mlp": 0.01083147, + "balance_loss_clip": 1.02429843, + "balance_loss_mlp": 1.00299096, + "epoch": 0.8475921361149522, + "flos": 38180906974080.0, + "grad_norm": 1.6844079038583097, + "language_loss": 0.75950229, + "learning_rate": 2.386191332042031e-07, + "loss": 0.78166962, + "num_input_tokens_seen": 152511210, + "step": 7049, + "time_per_iteration": 2.8076183795928955 + }, + { + "auxiliary_loss_clip": 0.0113726, + "auxiliary_loss_mlp": 0.01084004, + "balance_loss_clip": 1.02755392, + "balance_loss_mlp": 1.00384796, + "epoch": 0.8477123790055913, + "flos": 25375054723200.0, + "grad_norm": 1.651553406708728, + "language_loss": 0.72476029, + "learning_rate": 2.3825027397073794e-07, + "loss": 0.74697298, + "num_input_tokens_seen": 152531685, + "step": 7050, + "time_per_iteration": 2.6948816776275635 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.01083231, + "balance_loss_clip": 1.02444756, + "balance_loss_mlp": 1.00321758, + "epoch": 0.8478326218962304, + "flos": 30225185389440.0, + "grad_norm": 2.1543827904521975, + "language_loss": 0.66287136, + "learning_rate": 2.3788168199330515e-07, + "loss": 0.68494701, + "num_input_tokens_seen": 152553245, + "step": 7051, + "time_per_iteration": 2.70320463180542 + }, + { + "auxiliary_loss_clip": 0.01117024, + "auxiliary_loss_mlp": 0.01083969, + "balance_loss_clip": 1.02378178, + "balance_loss_mlp": 1.00386071, + "epoch": 0.8479528647868695, + "flos": 38213800853760.0, + "grad_norm": 1.516647324262476, + "language_loss": 0.72574556, + "learning_rate": 2.3751335732782074e-07, + "loss": 0.74775553, + "num_input_tokens_seen": 152574505, + "step": 7052, + "time_per_iteration": 2.8677308559417725 + }, + { + "auxiliary_loss_clip": 0.01126466, + "auxiliary_loss_mlp": 0.0108406, + "balance_loss_clip": 1.02613461, + "balance_loss_mlp": 1.00399852, + "epoch": 0.8480731076775085, + "flos": 20957790856320.0, + "grad_norm": 1.8084895427821832, + "language_loss": 0.79268169, + "learning_rate": 2.371453000301582e-07, + "loss": 0.81478691, + "num_input_tokens_seen": 152593190, + "step": 7053, + "time_per_iteration": 2.6363790035247803 + }, + { + "auxiliary_loss_clip": 0.01104177, + "auxiliary_loss_mlp": 0.01084085, + "balance_loss_clip": 1.02244067, + "balance_loss_mlp": 1.00402403, + "epoch": 0.8481933505681477, + "flos": 32596510487040.0, + "grad_norm": 1.8309842418302609, + "language_loss": 0.74636835, + "learning_rate": 2.3677751015615222e-07, + "loss": 0.76825094, + "num_input_tokens_seen": 152615265, + "step": 7054, + "time_per_iteration": 2.7768876552581787 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01084551, + "balance_loss_clip": 1.02547181, + "balance_loss_mlp": 1.00429893, + "epoch": 0.8483135934587868, + "flos": 20741177888640.0, + "grad_norm": 1.671392023425049, + "language_loss": 0.85074931, + "learning_rate": 2.3640998776159593e-07, + "loss": 0.87278724, + "num_input_tokens_seen": 152632770, + "step": 7055, + "time_per_iteration": 3.6350302696228027 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01083403, + "balance_loss_clip": 1.02451849, + "balance_loss_mlp": 1.00348496, + "epoch": 0.8484338363494258, + "flos": 21653057485440.0, + "grad_norm": 1.5875565434646055, + "language_loss": 0.81048322, + "learning_rate": 2.3604273290224253e-07, + "loss": 0.83247542, + "num_input_tokens_seen": 152653485, + "step": 7056, + "time_per_iteration": 3.730238437652588 + }, + { + "auxiliary_loss_clip": 0.01114921, + "auxiliary_loss_mlp": 0.01084581, + "balance_loss_clip": 1.02454686, + "balance_loss_mlp": 1.00442481, + "epoch": 0.848554079240065, + "flos": 15013964926080.0, + "grad_norm": 1.8797482452491996, + "language_loss": 0.74559426, + "learning_rate": 2.356757456338039e-07, + "loss": 0.76758927, + "num_input_tokens_seen": 152670970, + "step": 7057, + "time_per_iteration": 2.6506950855255127 + }, + { + "auxiliary_loss_clip": 0.01092962, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_clip": 1.01412308, + "balance_loss_mlp": 1.00010478, + "epoch": 0.848674322130704, + "flos": 68060453742720.0, + "grad_norm": 0.7461139689549793, + "language_loss": 0.59086168, + "learning_rate": 2.3530902601195147e-07, + "loss": 0.61258197, + "num_input_tokens_seen": 152739460, + "step": 7058, + "time_per_iteration": 3.3702733516693115 + }, + { + "auxiliary_loss_clip": 0.01119585, + "auxiliary_loss_mlp": 0.01084358, + "balance_loss_clip": 1.02582407, + "balance_loss_mlp": 1.00415373, + "epoch": 0.8487945650213431, + "flos": 18475788977280.0, + "grad_norm": 3.1429752814115623, + "language_loss": 0.7881608, + "learning_rate": 2.34942574092317e-07, + "loss": 0.81020027, + "num_input_tokens_seen": 152754710, + "step": 7059, + "time_per_iteration": 3.5865561962127686 + }, + { + "auxiliary_loss_clip": 0.0112632, + "auxiliary_loss_mlp": 0.01084675, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.00456643, + "epoch": 0.8489148079119821, + "flos": 23473189405440.0, + "grad_norm": 1.9053202224195969, + "language_loss": 0.76885521, + "learning_rate": 2.3457638993049045e-07, + "loss": 0.79096514, + "num_input_tokens_seen": 152772700, + "step": 7060, + "time_per_iteration": 2.68532657623291 + }, + { + "auxiliary_loss_clip": 0.0108432, + "auxiliary_loss_mlp": 0.01083606, + "balance_loss_clip": 1.02058291, + "balance_loss_mlp": 1.00340176, + "epoch": 0.8490350508026213, + "flos": 19937604775680.0, + "grad_norm": 1.7844906741367592, + "language_loss": 0.64305818, + "learning_rate": 2.3421047358202252e-07, + "loss": 0.66473746, + "num_input_tokens_seen": 152791550, + "step": 7061, + "time_per_iteration": 3.6676502227783203 + }, + { + "auxiliary_loss_clip": 0.01126581, + "auxiliary_loss_mlp": 0.01084054, + "balance_loss_clip": 1.02616382, + "balance_loss_mlp": 1.00394559, + "epoch": 0.8491552936932604, + "flos": 24279958828800.0, + "grad_norm": 2.216310968821085, + "language_loss": 0.83296835, + "learning_rate": 2.3384482510242144e-07, + "loss": 0.85507464, + "num_input_tokens_seen": 152809410, + "step": 7062, + "time_per_iteration": 2.6582560539245605 + }, + { + "auxiliary_loss_clip": 0.01133708, + "auxiliary_loss_mlp": 0.0108427, + "balance_loss_clip": 1.02423048, + "balance_loss_mlp": 1.00420952, + "epoch": 0.8492755365838994, + "flos": 22522526098560.0, + "grad_norm": 3.1725884972456244, + "language_loss": 0.77264678, + "learning_rate": 2.3347944454715575e-07, + "loss": 0.79482663, + "num_input_tokens_seen": 152825800, + "step": 7063, + "time_per_iteration": 2.645439386367798 + }, + { + "auxiliary_loss_clip": 0.01135338, + "auxiliary_loss_mlp": 0.01084507, + "balance_loss_clip": 1.02618074, + "balance_loss_mlp": 1.00430322, + "epoch": 0.8493957794745386, + "flos": 26980441182720.0, + "grad_norm": 1.6542345044609112, + "language_loss": 0.67492521, + "learning_rate": 2.331143319716542e-07, + "loss": 0.69712371, + "num_input_tokens_seen": 152845330, + "step": 7064, + "time_per_iteration": 2.6554627418518066 + }, + { + "auxiliary_loss_clip": 0.01085101, + "auxiliary_loss_mlp": 0.01083989, + "balance_loss_clip": 1.02375233, + "balance_loss_mlp": 1.00383282, + "epoch": 0.8495160223651776, + "flos": 29861985018240.0, + "grad_norm": 2.4004344740468544, + "language_loss": 0.65962499, + "learning_rate": 2.3274948743130363e-07, + "loss": 0.6813159, + "num_input_tokens_seen": 152865165, + "step": 7065, + "time_per_iteration": 2.755976676940918 + }, + { + "auxiliary_loss_clip": 0.01135234, + "auxiliary_loss_mlp": 0.01082751, + "balance_loss_clip": 1.02549779, + "balance_loss_mlp": 1.00264251, + "epoch": 0.8496362652558167, + "flos": 23075443128960.0, + "grad_norm": 1.601037289543307, + "language_loss": 0.79532957, + "learning_rate": 2.3238491098145085e-07, + "loss": 0.81750947, + "num_input_tokens_seen": 152884695, + "step": 7066, + "time_per_iteration": 2.701038360595703 + }, + { + "auxiliary_loss_clip": 0.01124967, + "auxiliary_loss_mlp": 0.01084217, + "balance_loss_clip": 1.02494121, + "balance_loss_mlp": 1.0040133, + "epoch": 0.8497565081464559, + "flos": 14609107756800.0, + "grad_norm": 2.274359805705839, + "language_loss": 0.73380089, + "learning_rate": 2.3202060267740141e-07, + "loss": 0.75589275, + "num_input_tokens_seen": 152902220, + "step": 7067, + "time_per_iteration": 2.650585412979126 + }, + { + "auxiliary_loss_clip": 0.01092899, + "auxiliary_loss_mlp": 0.01084245, + "balance_loss_clip": 1.02289891, + "balance_loss_mlp": 1.00408864, + "epoch": 0.8498767510370949, + "flos": 21136446126720.0, + "grad_norm": 2.1012467730725506, + "language_loss": 0.77042794, + "learning_rate": 2.3165656257442044e-07, + "loss": 0.79219937, + "num_input_tokens_seen": 152920740, + "step": 7068, + "time_per_iteration": 2.840819835662842 + }, + { + "auxiliary_loss_clip": 0.01123767, + "auxiliary_loss_mlp": 0.01083721, + "balance_loss_clip": 1.0245012, + "balance_loss_mlp": 1.00365961, + "epoch": 0.849996993927734, + "flos": 23654538195840.0, + "grad_norm": 2.0976999220400527, + "language_loss": 0.89886689, + "learning_rate": 2.31292790727734e-07, + "loss": 0.92094183, + "num_input_tokens_seen": 152938305, + "step": 7069, + "time_per_iteration": 2.634546995162964 + }, + { + "auxiliary_loss_clip": 0.01133178, + "auxiliary_loss_mlp": 0.01083641, + "balance_loss_clip": 1.02426004, + "balance_loss_mlp": 1.00358009, + "epoch": 0.8501172368183731, + "flos": 20558069331840.0, + "grad_norm": 4.078627501951059, + "language_loss": 0.80403942, + "learning_rate": 2.3092928719252392e-07, + "loss": 0.82620764, + "num_input_tokens_seen": 152956705, + "step": 7070, + "time_per_iteration": 2.629603624343872 + }, + { + "auxiliary_loss_clip": 0.01127521, + "auxiliary_loss_mlp": 0.01084608, + "balance_loss_clip": 1.02612591, + "balance_loss_mlp": 1.00454664, + "epoch": 0.8502374797090122, + "flos": 22272624201600.0, + "grad_norm": 1.939034479139522, + "language_loss": 0.78373903, + "learning_rate": 2.3056605202393475e-07, + "loss": 0.80586028, + "num_input_tokens_seen": 152974265, + "step": 7071, + "time_per_iteration": 2.6261179447174072 + }, + { + "auxiliary_loss_clip": 0.01125534, + "auxiliary_loss_mlp": 0.00872964, + "balance_loss_clip": 1.02392602, + "balance_loss_mlp": 1.00006115, + "epoch": 0.8503577225996513, + "flos": 23659817495040.0, + "grad_norm": 1.8458825599595436, + "language_loss": 0.66450429, + "learning_rate": 2.3020308527706888e-07, + "loss": 0.68448925, + "num_input_tokens_seen": 152993680, + "step": 7072, + "time_per_iteration": 2.696993827819824 + }, + { + "auxiliary_loss_clip": 0.01116902, + "auxiliary_loss_mlp": 0.0108452, + "balance_loss_clip": 1.02407038, + "balance_loss_mlp": 1.0044595, + "epoch": 0.8504779654902904, + "flos": 26758513002240.0, + "grad_norm": 1.54526175087435, + "language_loss": 0.88551605, + "learning_rate": 2.2984038700698715e-07, + "loss": 0.90753031, + "num_input_tokens_seen": 153012990, + "step": 7073, + "time_per_iteration": 2.7359259128570557 + }, + { + "auxiliary_loss_clip": 0.01124114, + "auxiliary_loss_mlp": 0.0108488, + "balance_loss_clip": 1.02479267, + "balance_loss_mlp": 1.00472391, + "epoch": 0.8505982083809295, + "flos": 26468247196800.0, + "grad_norm": 1.4894285232506521, + "language_loss": 0.78880715, + "learning_rate": 2.2947795726871222e-07, + "loss": 0.81089705, + "num_input_tokens_seen": 153034015, + "step": 7074, + "time_per_iteration": 2.7464780807495117 + }, + { + "auxiliary_loss_clip": 0.01121071, + "auxiliary_loss_mlp": 0.00872796, + "balance_loss_clip": 1.02331305, + "balance_loss_mlp": 1.00011468, + "epoch": 0.8507184512715685, + "flos": 20303390926080.0, + "grad_norm": 1.8201114999383723, + "language_loss": 0.8569504, + "learning_rate": 2.2911579611722253e-07, + "loss": 0.87688911, + "num_input_tokens_seen": 153053160, + "step": 7075, + "time_per_iteration": 2.622335910797119 + }, + { + "auxiliary_loss_clip": 0.01101886, + "auxiliary_loss_mlp": 0.01084276, + "balance_loss_clip": 1.02608824, + "balance_loss_mlp": 1.00421488, + "epoch": 0.8508386941622077, + "flos": 19025186474880.0, + "grad_norm": 1.6940327249251137, + "language_loss": 0.87362832, + "learning_rate": 2.2875390360745905e-07, + "loss": 0.89548993, + "num_input_tokens_seen": 153072565, + "step": 7076, + "time_per_iteration": 2.733367919921875 + }, + { + "auxiliary_loss_clip": 0.0110185, + "auxiliary_loss_mlp": 0.01084022, + "balance_loss_clip": 1.02290869, + "balance_loss_mlp": 1.00396085, + "epoch": 0.8509589370528468, + "flos": 16433405654400.0, + "grad_norm": 2.075800773802915, + "language_loss": 0.77591252, + "learning_rate": 2.2839227979432008e-07, + "loss": 0.79777122, + "num_input_tokens_seen": 153090215, + "step": 7077, + "time_per_iteration": 2.6910688877105713 + }, + { + "auxiliary_loss_clip": 0.01116117, + "auxiliary_loss_mlp": 0.01083929, + "balance_loss_clip": 1.02429795, + "balance_loss_mlp": 1.00377321, + "epoch": 0.8510791799434858, + "flos": 18259714713600.0, + "grad_norm": 1.7649045857472703, + "language_loss": 0.84907448, + "learning_rate": 2.2803092473266373e-07, + "loss": 0.87107491, + "num_input_tokens_seen": 153107740, + "step": 7078, + "time_per_iteration": 2.7630748748779297 + }, + { + "auxiliary_loss_clip": 0.01137124, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_clip": 1.02720642, + "balance_loss_mlp": 1.00465834, + "epoch": 0.851199422834125, + "flos": 23441372933760.0, + "grad_norm": 2.3439191141387, + "language_loss": 0.86407292, + "learning_rate": 2.2766983847730724e-07, + "loss": 0.8862918, + "num_input_tokens_seen": 153127410, + "step": 7079, + "time_per_iteration": 2.709150791168213 + }, + { + "auxiliary_loss_clip": 0.0110836, + "auxiliary_loss_mlp": 0.01084499, + "balance_loss_clip": 1.02351379, + "balance_loss_mlp": 1.00443804, + "epoch": 0.851319665724764, + "flos": 16289404030080.0, + "grad_norm": 2.0080979230588945, + "language_loss": 0.66815519, + "learning_rate": 2.2730902108302663e-07, + "loss": 0.69008374, + "num_input_tokens_seen": 153144325, + "step": 7080, + "time_per_iteration": 2.7543509006500244 + }, + { + "auxiliary_loss_clip": 0.01117982, + "auxiliary_loss_mlp": 0.01084581, + "balance_loss_clip": 1.02466059, + "balance_loss_mlp": 1.00447297, + "epoch": 0.8514399086154031, + "flos": 18989347680000.0, + "grad_norm": 1.617862425110742, + "language_loss": 0.68440592, + "learning_rate": 2.269484726045583e-07, + "loss": 0.70643157, + "num_input_tokens_seen": 153163240, + "step": 7081, + "time_per_iteration": 3.610846757888794 + }, + { + "auxiliary_loss_clip": 0.01106947, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_clip": 1.02412796, + "balance_loss_mlp": 1.00354195, + "epoch": 0.8515601515060423, + "flos": 24571194301440.0, + "grad_norm": 1.7402250230064218, + "language_loss": 0.79318774, + "learning_rate": 2.2658819309659672e-07, + "loss": 0.81509328, + "num_input_tokens_seen": 153183440, + "step": 7082, + "time_per_iteration": 3.6516048908233643 + }, + { + "auxiliary_loss_clip": 0.0111729, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.02570987, + "balance_loss_mlp": 1.00344515, + "epoch": 0.8516803943966813, + "flos": 19529443555200.0, + "grad_norm": 1.780926566857549, + "language_loss": 0.8452704, + "learning_rate": 2.2622818261379706e-07, + "loss": 0.86727786, + "num_input_tokens_seen": 153200460, + "step": 7083, + "time_per_iteration": 2.7069854736328125 + }, + { + "auxiliary_loss_clip": 0.01117923, + "auxiliary_loss_mlp": 0.01083282, + "balance_loss_clip": 1.02442074, + "balance_loss_mlp": 1.00322056, + "epoch": 0.8518006372873204, + "flos": 20265792364800.0, + "grad_norm": 1.7727141506503252, + "language_loss": 0.74820006, + "learning_rate": 2.2586844121077142e-07, + "loss": 0.77021205, + "num_input_tokens_seen": 153218970, + "step": 7084, + "time_per_iteration": 3.6604480743408203 + }, + { + "auxiliary_loss_clip": 0.01084241, + "auxiliary_loss_mlp": 0.01084814, + "balance_loss_clip": 1.02425623, + "balance_loss_mlp": 1.00465775, + "epoch": 0.8519208801779595, + "flos": 24133227770880.0, + "grad_norm": 1.7847120274838453, + "language_loss": 0.71894002, + "learning_rate": 2.2550896894209215e-07, + "loss": 0.74063051, + "num_input_tokens_seen": 153238485, + "step": 7085, + "time_per_iteration": 2.7781152725219727 + }, + { + "auxiliary_loss_clip": 0.01071736, + "auxiliary_loss_mlp": 0.01078997, + "balance_loss_clip": 1.01774526, + "balance_loss_mlp": 1.00003231, + "epoch": 0.8520411230685986, + "flos": 63035223252480.0, + "grad_norm": 0.6819664578113712, + "language_loss": 0.56651175, + "learning_rate": 2.2514976586229184e-07, + "loss": 0.58801913, + "num_input_tokens_seen": 153306430, + "step": 7086, + "time_per_iteration": 3.4866530895233154 + }, + { + "auxiliary_loss_clip": 0.01105168, + "auxiliary_loss_mlp": 0.01078906, + "balance_loss_clip": 1.01810694, + "balance_loss_mlp": 0.99994147, + "epoch": 0.8521613659592376, + "flos": 65836865283840.0, + "grad_norm": 0.7780187869608675, + "language_loss": 0.54731643, + "learning_rate": 2.247908320258609e-07, + "loss": 0.56915718, + "num_input_tokens_seen": 153366520, + "step": 7087, + "time_per_iteration": 4.10399866104126 + }, + { + "auxiliary_loss_clip": 0.01086182, + "auxiliary_loss_mlp": 0.01083652, + "balance_loss_clip": 1.02090597, + "balance_loss_mlp": 1.00344825, + "epoch": 0.8522816088498768, + "flos": 23112323418240.0, + "grad_norm": 1.8734201263456063, + "language_loss": 0.79541105, + "learning_rate": 2.2443216748724914e-07, + "loss": 0.81710941, + "num_input_tokens_seen": 153387230, + "step": 7088, + "time_per_iteration": 2.8393237590789795 + }, + { + "auxiliary_loss_clip": 0.01126545, + "auxiliary_loss_mlp": 0.00872953, + "balance_loss_clip": 1.02595413, + "balance_loss_mlp": 1.00006974, + "epoch": 0.8524018517405159, + "flos": 31758140073600.0, + "grad_norm": 3.132113879703658, + "language_loss": 0.74527395, + "learning_rate": 2.2407377230086588e-07, + "loss": 0.76526892, + "num_input_tokens_seen": 153409585, + "step": 7089, + "time_per_iteration": 2.7428245544433594 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_clip": 1.02111864, + "balance_loss_mlp": 1.00479031, + "epoch": 0.8525220946311549, + "flos": 18690318956160.0, + "grad_norm": 1.7931792640556479, + "language_loss": 0.83777642, + "learning_rate": 2.23715646521079e-07, + "loss": 0.85964382, + "num_input_tokens_seen": 153427105, + "step": 7090, + "time_per_iteration": 2.767249822616577 + }, + { + "auxiliary_loss_clip": 0.01125528, + "auxiliary_loss_mlp": 0.00872986, + "balance_loss_clip": 1.02445412, + "balance_loss_mlp": 1.00004125, + "epoch": 0.852642337521794, + "flos": 21793216354560.0, + "grad_norm": 1.8643406927148163, + "language_loss": 0.83900547, + "learning_rate": 2.2335779020221724e-07, + "loss": 0.85899067, + "num_input_tokens_seen": 153443725, + "step": 7091, + "time_per_iteration": 2.699514389038086 + }, + { + "auxiliary_loss_clip": 0.01097088, + "auxiliary_loss_mlp": 0.01079494, + "balance_loss_clip": 1.01059008, + "balance_loss_mlp": 1.00053012, + "epoch": 0.8527625804124331, + "flos": 69040132260480.0, + "grad_norm": 0.8016817000212171, + "language_loss": 0.56472725, + "learning_rate": 2.2300020339856497e-07, + "loss": 0.58649307, + "num_input_tokens_seen": 153506410, + "step": 7092, + "time_per_iteration": 3.2652077674865723 + }, + { + "auxiliary_loss_clip": 0.01114274, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_clip": 1.02331829, + "balance_loss_mlp": 1.00321245, + "epoch": 0.8528828233030722, + "flos": 26979399688320.0, + "grad_norm": 2.116075778531479, + "language_loss": 0.77845931, + "learning_rate": 2.2264288616436966e-07, + "loss": 0.80043483, + "num_input_tokens_seen": 153526665, + "step": 7093, + "time_per_iteration": 2.774913787841797 + }, + { + "auxiliary_loss_clip": 0.0110791, + "auxiliary_loss_mlp": 0.01083955, + "balance_loss_clip": 1.02354288, + "balance_loss_mlp": 1.00370359, + "epoch": 0.8530030661937112, + "flos": 17487598936320.0, + "grad_norm": 1.9556638857956328, + "language_loss": 0.7254132, + "learning_rate": 2.222858385538351e-07, + "loss": 0.74733186, + "num_input_tokens_seen": 153543465, + "step": 7094, + "time_per_iteration": 2.7371346950531006 + }, + { + "auxiliary_loss_clip": 0.01126705, + "auxiliary_loss_mlp": 0.0108416, + "balance_loss_clip": 1.02501154, + "balance_loss_mlp": 1.00414693, + "epoch": 0.8531233090843504, + "flos": 22160798184960.0, + "grad_norm": 2.356664357978651, + "language_loss": 0.67683291, + "learning_rate": 2.2192906062112527e-07, + "loss": 0.69894147, + "num_input_tokens_seen": 153563340, + "step": 7095, + "time_per_iteration": 2.658900499343872 + }, + { + "auxiliary_loss_clip": 0.01136009, + "auxiliary_loss_mlp": 0.01084306, + "balance_loss_clip": 1.02658057, + "balance_loss_mlp": 1.00410247, + "epoch": 0.8532435519749895, + "flos": 37635388145280.0, + "grad_norm": 1.4462504783281982, + "language_loss": 0.70463526, + "learning_rate": 2.2157255242036377e-07, + "loss": 0.72683841, + "num_input_tokens_seen": 153587005, + "step": 7096, + "time_per_iteration": 2.7474708557128906 + }, + { + "auxiliary_loss_clip": 0.01105641, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_clip": 1.02404714, + "balance_loss_mlp": 1.00432754, + "epoch": 0.8533637948656285, + "flos": 21398163598080.0, + "grad_norm": 1.6166410364764694, + "language_loss": 0.7417087, + "learning_rate": 2.2121631400563135e-07, + "loss": 0.76360947, + "num_input_tokens_seen": 153606835, + "step": 7097, + "time_per_iteration": 2.710860252380371 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01078989, + "balance_loss_clip": 1.01396835, + "balance_loss_mlp": 1.00002515, + "epoch": 0.8534840377562677, + "flos": 53345122490880.0, + "grad_norm": 0.7550635157388931, + "language_loss": 0.52975821, + "learning_rate": 2.208603454309701e-07, + "loss": 0.55155784, + "num_input_tokens_seen": 153664925, + "step": 7098, + "time_per_iteration": 3.193161725997925 + }, + { + "auxiliary_loss_clip": 0.01096373, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_clip": 1.02311242, + "balance_loss_mlp": 1.00309658, + "epoch": 0.8536042806469067, + "flos": 20814148368000.0, + "grad_norm": 1.7406738316332275, + "language_loss": 0.70498067, + "learning_rate": 2.2050464675037994e-07, + "loss": 0.72677743, + "num_input_tokens_seen": 153683550, + "step": 7099, + "time_per_iteration": 2.7945892810821533 + }, + { + "auxiliary_loss_clip": 0.01114982, + "auxiliary_loss_mlp": 0.0108479, + "balance_loss_clip": 1.02383685, + "balance_loss_mlp": 1.00458574, + "epoch": 0.8537245235375458, + "flos": 24681368292480.0, + "grad_norm": 2.0002521361354364, + "language_loss": 0.72878796, + "learning_rate": 2.2014921801782016e-07, + "loss": 0.75078571, + "num_input_tokens_seen": 153703040, + "step": 7100, + "time_per_iteration": 2.693586826324463 + }, + { + "auxiliary_loss_clip": 0.01117562, + "auxiliary_loss_mlp": 0.01084399, + "balance_loss_clip": 1.02435422, + "balance_loss_mlp": 1.00429058, + "epoch": 0.853844766428185, + "flos": 24384817607040.0, + "grad_norm": 2.0350097869758046, + "language_loss": 0.73819977, + "learning_rate": 2.1979405928720872e-07, + "loss": 0.7602194, + "num_input_tokens_seen": 153722695, + "step": 7101, + "time_per_iteration": 2.738858461380005 + }, + { + "auxiliary_loss_clip": 0.01118221, + "auxiliary_loss_mlp": 0.0108417, + "balance_loss_clip": 1.0248394, + "balance_loss_mlp": 1.00420475, + "epoch": 0.853965009318824, + "flos": 20955707867520.0, + "grad_norm": 1.4108119467700804, + "language_loss": 0.79546964, + "learning_rate": 2.1943917061242257e-07, + "loss": 0.81749356, + "num_input_tokens_seen": 153742550, + "step": 7102, + "time_per_iteration": 2.71467661857605 + }, + { + "auxiliary_loss_clip": 0.01128382, + "auxiliary_loss_mlp": 0.00872986, + "balance_loss_clip": 1.02655876, + "balance_loss_mlp": 1.00005627, + "epoch": 0.8540852522094631, + "flos": 24201816791040.0, + "grad_norm": 1.6143728388158936, + "language_loss": 0.66194439, + "learning_rate": 2.1908455204729903e-07, + "loss": 0.68195808, + "num_input_tokens_seen": 153761700, + "step": 7103, + "time_per_iteration": 2.715100049972534 + }, + { + "auxiliary_loss_clip": 0.01118829, + "auxiliary_loss_mlp": 0.01083871, + "balance_loss_clip": 1.02544296, + "balance_loss_mlp": 1.00376272, + "epoch": 0.8542054951001022, + "flos": 25082921410560.0, + "grad_norm": 2.111989775765563, + "language_loss": 0.78396928, + "learning_rate": 2.1873020364563265e-07, + "loss": 0.8059963, + "num_input_tokens_seen": 153780765, + "step": 7104, + "time_per_iteration": 2.7354984283447266 + }, + { + "auxiliary_loss_clip": 0.011185, + "auxiliary_loss_mlp": 0.01084238, + "balance_loss_clip": 1.02427948, + "balance_loss_mlp": 1.00417662, + "epoch": 0.8543257379907413, + "flos": 24316551809280.0, + "grad_norm": 2.0893624441337018, + "language_loss": 0.76202381, + "learning_rate": 2.183761254611789e-07, + "loss": 0.78405118, + "num_input_tokens_seen": 153801090, + "step": 7105, + "time_per_iteration": 2.7223353385925293 + }, + { + "auxiliary_loss_clip": 0.01126763, + "auxiliary_loss_mlp": 0.01083439, + "balance_loss_clip": 1.02679276, + "balance_loss_mlp": 1.00333095, + "epoch": 0.8544459808813804, + "flos": 55286630467200.0, + "grad_norm": 2.1728177167689564, + "language_loss": 0.70344114, + "learning_rate": 2.1802231754764987e-07, + "loss": 0.72554314, + "num_input_tokens_seen": 153826530, + "step": 7106, + "time_per_iteration": 3.8327786922454834 + }, + { + "auxiliary_loss_clip": 0.01115387, + "auxiliary_loss_mlp": 0.01083796, + "balance_loss_clip": 1.02388012, + "balance_loss_mlp": 1.00368786, + "epoch": 0.8545662237720195, + "flos": 25776248705280.0, + "grad_norm": 1.8121998719729067, + "language_loss": 0.76795185, + "learning_rate": 2.17668779958718e-07, + "loss": 0.7899437, + "num_input_tokens_seen": 153849110, + "step": 7107, + "time_per_iteration": 2.7382495403289795 + }, + { + "auxiliary_loss_clip": 0.01135329, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_clip": 1.02641749, + "balance_loss_mlp": 1.00340343, + "epoch": 0.8546864666626586, + "flos": 11108320427520.0, + "grad_norm": 2.1852116131468784, + "language_loss": 0.80548435, + "learning_rate": 2.1731551274801553e-07, + "loss": 0.82767272, + "num_input_tokens_seen": 153865550, + "step": 7108, + "time_per_iteration": 3.5955216884613037 + }, + { + "auxiliary_loss_clip": 0.0111235, + "auxiliary_loss_mlp": 0.01084874, + "balance_loss_clip": 1.02169442, + "balance_loss_mlp": 1.00476587, + "epoch": 0.8548067095532976, + "flos": 25520169669120.0, + "grad_norm": 2.0664205946031857, + "language_loss": 0.61345589, + "learning_rate": 2.169625159691324e-07, + "loss": 0.63542813, + "num_input_tokens_seen": 153885425, + "step": 7109, + "time_per_iteration": 3.708900213241577 + }, + { + "auxiliary_loss_clip": 0.01098317, + "auxiliary_loss_mlp": 0.01083621, + "balance_loss_clip": 1.02286744, + "balance_loss_mlp": 1.00351191, + "epoch": 0.8549269524439368, + "flos": 24717853532160.0, + "grad_norm": 2.02693163173122, + "language_loss": 0.74302375, + "learning_rate": 2.1660978967561784e-07, + "loss": 0.76484311, + "num_input_tokens_seen": 153904760, + "step": 7110, + "time_per_iteration": 2.777540445327759 + }, + { + "auxiliary_loss_clip": 0.01134434, + "auxiliary_loss_mlp": 0.01083705, + "balance_loss_clip": 1.02505851, + "balance_loss_mlp": 1.00350106, + "epoch": 0.8550471953345758, + "flos": 19825599191040.0, + "grad_norm": 4.514283867336992, + "language_loss": 0.78674948, + "learning_rate": 2.1625733392098035e-07, + "loss": 0.80893087, + "num_input_tokens_seen": 153920370, + "step": 7111, + "time_per_iteration": 2.652951955795288 + }, + { + "auxiliary_loss_clip": 0.0113451, + "auxiliary_loss_mlp": 0.01083507, + "balance_loss_clip": 1.02519345, + "balance_loss_mlp": 1.00339818, + "epoch": 0.8551674382252149, + "flos": 22820441500800.0, + "grad_norm": 1.6116676135378984, + "language_loss": 0.79524702, + "learning_rate": 2.159051487586867e-07, + "loss": 0.81742716, + "num_input_tokens_seen": 153940500, + "step": 7112, + "time_per_iteration": 2.667025566101074 + }, + { + "auxiliary_loss_clip": 0.01116408, + "auxiliary_loss_mlp": 0.01083297, + "balance_loss_clip": 1.0248394, + "balance_loss_mlp": 1.00318813, + "epoch": 0.8552876811158541, + "flos": 20631255292800.0, + "grad_norm": 2.11677222282754, + "language_loss": 0.72334754, + "learning_rate": 2.155532342421642e-07, + "loss": 0.74534464, + "num_input_tokens_seen": 153958500, + "step": 7113, + "time_per_iteration": 3.618882417678833 + }, + { + "auxiliary_loss_clip": 0.01128476, + "auxiliary_loss_mlp": 0.01085162, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.0049578, + "epoch": 0.8554079240064931, + "flos": 23112359331840.0, + "grad_norm": 1.6198086976966326, + "language_loss": 0.78389472, + "learning_rate": 2.1520159042479636e-07, + "loss": 0.80603111, + "num_input_tokens_seen": 153976790, + "step": 7114, + "time_per_iteration": 2.689587354660034 + }, + { + "auxiliary_loss_clip": 0.01125571, + "auxiliary_loss_mlp": 0.01084259, + "balance_loss_clip": 1.02568281, + "balance_loss_mlp": 1.00410247, + "epoch": 0.8555281668971322, + "flos": 22128047959680.0, + "grad_norm": 2.116507706830507, + "language_loss": 0.71348524, + "learning_rate": 2.148502173599287e-07, + "loss": 0.73558354, + "num_input_tokens_seen": 153994930, + "step": 7115, + "time_per_iteration": 2.659327268600464 + }, + { + "auxiliary_loss_clip": 0.01112072, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_clip": 1.02585888, + "balance_loss_mlp": 1.0033114, + "epoch": 0.8556484097877713, + "flos": 31139040234240.0, + "grad_norm": 1.751166765489867, + "language_loss": 0.65763992, + "learning_rate": 2.1449911510086372e-07, + "loss": 0.67959487, + "num_input_tokens_seen": 154014400, + "step": 7116, + "time_per_iteration": 2.811189889907837 + }, + { + "auxiliary_loss_clip": 0.01124211, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_clip": 1.0243299, + "balance_loss_mlp": 1.00444174, + "epoch": 0.8557686526784104, + "flos": 24316551809280.0, + "grad_norm": 1.7897853217503856, + "language_loss": 0.76919007, + "learning_rate": 2.141482837008628e-07, + "loss": 0.79127777, + "num_input_tokens_seen": 154034940, + "step": 7117, + "time_per_iteration": 2.703059673309326 + }, + { + "auxiliary_loss_clip": 0.01126743, + "auxiliary_loss_mlp": 0.01084494, + "balance_loss_clip": 1.02563429, + "balance_loss_mlp": 1.00443316, + "epoch": 0.8558888955690495, + "flos": 17712723427200.0, + "grad_norm": 1.9725189094891746, + "language_loss": 0.71851289, + "learning_rate": 2.1379772321314826e-07, + "loss": 0.74062526, + "num_input_tokens_seen": 154052985, + "step": 7118, + "time_per_iteration": 2.6756174564361572 + }, + { + "auxiliary_loss_clip": 0.01081846, + "auxiliary_loss_mlp": 0.01085064, + "balance_loss_clip": 1.01841974, + "balance_loss_mlp": 1.00490808, + "epoch": 0.8560091384596886, + "flos": 19171702051200.0, + "grad_norm": 1.8692689477778928, + "language_loss": 0.81769997, + "learning_rate": 2.1344743369089802e-07, + "loss": 0.839369, + "num_input_tokens_seen": 154068765, + "step": 7119, + "time_per_iteration": 2.987459659576416 + }, + { + "auxiliary_loss_clip": 0.01112092, + "auxiliary_loss_mlp": 0.01084458, + "balance_loss_clip": 1.02207768, + "balance_loss_mlp": 1.0043968, + "epoch": 0.8561293813503277, + "flos": 23914855036800.0, + "grad_norm": 1.6346307229189228, + "language_loss": 0.82002056, + "learning_rate": 2.130974151872522e-07, + "loss": 0.84198606, + "num_input_tokens_seen": 154089100, + "step": 7120, + "time_per_iteration": 2.7913997173309326 + }, + { + "auxiliary_loss_clip": 0.01084339, + "auxiliary_loss_mlp": 0.01083983, + "balance_loss_clip": 1.02042317, + "balance_loss_mlp": 1.00382686, + "epoch": 0.8562496242409667, + "flos": 22529206028160.0, + "grad_norm": 2.40625855895135, + "language_loss": 0.78441226, + "learning_rate": 2.1274766775530773e-07, + "loss": 0.80609554, + "num_input_tokens_seen": 154108965, + "step": 7121, + "time_per_iteration": 2.6949992179870605 + }, + { + "auxiliary_loss_clip": 0.01136168, + "auxiliary_loss_mlp": 0.01084424, + "balance_loss_clip": 1.02598166, + "balance_loss_mlp": 1.00422037, + "epoch": 0.8563698671316058, + "flos": 14712745472640.0, + "grad_norm": 2.614597969353392, + "language_loss": 0.79565966, + "learning_rate": 2.1239819144812077e-07, + "loss": 0.81786561, + "num_input_tokens_seen": 154123425, + "step": 7122, + "time_per_iteration": 2.6070444583892822 + }, + { + "auxiliary_loss_clip": 0.01108345, + "auxiliary_loss_mlp": 0.01084359, + "balance_loss_clip": 1.02458346, + "balance_loss_mlp": 1.00410712, + "epoch": 0.856490110022245, + "flos": 39167768211840.0, + "grad_norm": 1.6698875911773075, + "language_loss": 0.69854999, + "learning_rate": 2.1204898631870716e-07, + "loss": 0.72047704, + "num_input_tokens_seen": 154148315, + "step": 7123, + "time_per_iteration": 2.88098406791687 + }, + { + "auxiliary_loss_clip": 0.01114982, + "auxiliary_loss_mlp": 0.01083906, + "balance_loss_clip": 1.02422118, + "balance_loss_mlp": 1.00384474, + "epoch": 0.856610352912884, + "flos": 29059345658880.0, + "grad_norm": 1.841129526630301, + "language_loss": 0.76123178, + "learning_rate": 2.1170005242004006e-07, + "loss": 0.78322065, + "num_input_tokens_seen": 154169665, + "step": 7124, + "time_per_iteration": 2.761103391647339 + }, + { + "auxiliary_loss_clip": 0.01101508, + "auxiliary_loss_mlp": 0.01083998, + "balance_loss_clip": 1.02513742, + "balance_loss_mlp": 1.00393724, + "epoch": 0.8567305958035231, + "flos": 23878333883520.0, + "grad_norm": 1.7255884050419272, + "language_loss": 0.78084028, + "learning_rate": 2.1135138980505384e-07, + "loss": 0.80269533, + "num_input_tokens_seen": 154190335, + "step": 7125, + "time_per_iteration": 2.824218511581421 + }, + { + "auxiliary_loss_clip": 0.01114589, + "auxiliary_loss_mlp": 0.01083997, + "balance_loss_clip": 1.02383018, + "balance_loss_mlp": 1.00388873, + "epoch": 0.8568508386941622, + "flos": 22200120599040.0, + "grad_norm": 1.7357883347689824, + "language_loss": 0.72263044, + "learning_rate": 2.110029985266395e-07, + "loss": 0.74461627, + "num_input_tokens_seen": 154210040, + "step": 7126, + "time_per_iteration": 2.667316198348999 + }, + { + "auxiliary_loss_clip": 0.01102713, + "auxiliary_loss_mlp": 0.01084327, + "balance_loss_clip": 1.02594924, + "balance_loss_mlp": 1.00421858, + "epoch": 0.8569710815848013, + "flos": 17307507121920.0, + "grad_norm": 1.5582613112932888, + "language_loss": 0.73881233, + "learning_rate": 2.1065487863764787e-07, + "loss": 0.7606827, + "num_input_tokens_seen": 154228385, + "step": 7127, + "time_per_iteration": 2.675884485244751 + }, + { + "auxiliary_loss_clip": 0.01092688, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_clip": 1.02258432, + "balance_loss_mlp": 1.00404239, + "epoch": 0.8570913244754403, + "flos": 23732285184000.0, + "grad_norm": 1.4044767903690034, + "language_loss": 0.85533643, + "learning_rate": 2.1030703019088846e-07, + "loss": 0.8771053, + "num_input_tokens_seen": 154249015, + "step": 7128, + "time_per_iteration": 2.7764732837677 + }, + { + "auxiliary_loss_clip": 0.01123679, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.02427328, + "balance_loss_mlp": 1.0033921, + "epoch": 0.8572115673660795, + "flos": 20048748433920.0, + "grad_norm": 2.054767823952241, + "language_loss": 0.70731342, + "learning_rate": 2.099594532391291e-07, + "loss": 0.7293852, + "num_input_tokens_seen": 154267700, + "step": 7129, + "time_per_iteration": 2.673922538757324 + }, + { + "auxiliary_loss_clip": 0.01124371, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_clip": 1.02350378, + "balance_loss_mlp": 1.00452638, + "epoch": 0.8573318102567186, + "flos": 27160389342720.0, + "grad_norm": 1.566303451309159, + "language_loss": 0.79075217, + "learning_rate": 2.0961214783509806e-07, + "loss": 0.81284273, + "num_input_tokens_seen": 154290580, + "step": 7130, + "time_per_iteration": 2.668152332305908 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_clip": 1.02453423, + "balance_loss_mlp": 1.00342083, + "epoch": 0.8574520531473576, + "flos": 24936585402240.0, + "grad_norm": 1.9626471858153431, + "language_loss": 0.74853683, + "learning_rate": 2.0926511403148051e-07, + "loss": 0.77054906, + "num_input_tokens_seen": 154309545, + "step": 7131, + "time_per_iteration": 2.7997045516967773 + }, + { + "auxiliary_loss_clip": 0.01092937, + "auxiliary_loss_mlp": 0.01085022, + "balance_loss_clip": 1.02572894, + "balance_loss_mlp": 1.0050087, + "epoch": 0.8575722960379968, + "flos": 18771154513920.0, + "grad_norm": 1.841448454008467, + "language_loss": 0.76011741, + "learning_rate": 2.0891835188092143e-07, + "loss": 0.78189695, + "num_input_tokens_seen": 154326545, + "step": 7132, + "time_per_iteration": 3.6606931686401367 + }, + { + "auxiliary_loss_clip": 0.01092431, + "auxiliary_loss_mlp": 0.01084154, + "balance_loss_clip": 1.024526, + "balance_loss_mlp": 1.00404549, + "epoch": 0.8576925389286358, + "flos": 22200300167040.0, + "grad_norm": 2.128058703081124, + "language_loss": 0.81434071, + "learning_rate": 2.0857186143602434e-07, + "loss": 0.83610654, + "num_input_tokens_seen": 154345190, + "step": 7133, + "time_per_iteration": 3.6260721683502197 + }, + { + "auxiliary_loss_clip": 0.01109696, + "auxiliary_loss_mlp": 0.01084584, + "balance_loss_clip": 1.02495193, + "balance_loss_mlp": 1.00442719, + "epoch": 0.8578127818192749, + "flos": 22894345733760.0, + "grad_norm": 1.6802343468678644, + "language_loss": 0.67817587, + "learning_rate": 2.0822564274935094e-07, + "loss": 0.70011866, + "num_input_tokens_seen": 154364615, + "step": 7134, + "time_per_iteration": 3.688156843185425 + }, + { + "auxiliary_loss_clip": 0.01109656, + "auxiliary_loss_mlp": 0.01083446, + "balance_loss_clip": 1.02092171, + "balance_loss_mlp": 1.00328946, + "epoch": 0.8579330247099141, + "flos": 34824839541120.0, + "grad_norm": 1.6934328517568682, + "language_loss": 0.67006171, + "learning_rate": 2.078796958734239e-07, + "loss": 0.69199276, + "num_input_tokens_seen": 154387335, + "step": 7135, + "time_per_iteration": 2.808959722518921 + }, + { + "auxiliary_loss_clip": 0.01124565, + "auxiliary_loss_mlp": 0.01084016, + "balance_loss_clip": 1.02509129, + "balance_loss_mlp": 1.0039072, + "epoch": 0.8580532676005531, + "flos": 19755681367680.0, + "grad_norm": 3.035519502829391, + "language_loss": 0.74713576, + "learning_rate": 2.0753402086072124e-07, + "loss": 0.7692216, + "num_input_tokens_seen": 154405965, + "step": 7136, + "time_per_iteration": 2.671391725540161 + }, + { + "auxiliary_loss_clip": 0.01064281, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_clip": 1.02334189, + "balance_loss_mlp": 1.00435984, + "epoch": 0.8581735104911922, + "flos": 22739318634240.0, + "grad_norm": 1.8942298871282102, + "language_loss": 0.75545597, + "learning_rate": 2.071886177636828e-07, + "loss": 0.77694303, + "num_input_tokens_seen": 154422750, + "step": 7137, + "time_per_iteration": 2.8955981731414795 + }, + { + "auxiliary_loss_clip": 0.01124988, + "auxiliary_loss_mlp": 0.01085348, + "balance_loss_clip": 1.02548087, + "balance_loss_mlp": 1.005144, + "epoch": 0.8582937533818313, + "flos": 23149131880320.0, + "grad_norm": 1.836273946639321, + "language_loss": 0.83405071, + "learning_rate": 2.0684348663470575e-07, + "loss": 0.85615408, + "num_input_tokens_seen": 154442930, + "step": 7138, + "time_per_iteration": 3.5980300903320312 + }, + { + "auxiliary_loss_clip": 0.01117666, + "auxiliary_loss_mlp": 0.01084697, + "balance_loss_clip": 1.02435637, + "balance_loss_mlp": 1.00458872, + "epoch": 0.8584139962724704, + "flos": 19498668577920.0, + "grad_norm": 1.987815426991031, + "language_loss": 0.61642593, + "learning_rate": 2.0649862752614555e-07, + "loss": 0.63844955, + "num_input_tokens_seen": 154461640, + "step": 7139, + "time_per_iteration": 2.7215795516967773 + }, + { + "auxiliary_loss_clip": 0.01097846, + "auxiliary_loss_mlp": 0.0107862, + "balance_loss_clip": 1.01868749, + "balance_loss_mlp": 0.99965537, + "epoch": 0.8585342391631094, + "flos": 71276577788160.0, + "grad_norm": 0.7502390261697922, + "language_loss": 0.57103467, + "learning_rate": 2.0615404049031838e-07, + "loss": 0.59279931, + "num_input_tokens_seen": 154518610, + "step": 7140, + "time_per_iteration": 3.3062949180603027 + }, + { + "auxiliary_loss_clip": 0.0112445, + "auxiliary_loss_mlp": 0.01084974, + "balance_loss_clip": 1.02501893, + "balance_loss_mlp": 1.0047698, + "epoch": 0.8586544820537486, + "flos": 10815432929280.0, + "grad_norm": 2.3544202480975636, + "language_loss": 0.77896911, + "learning_rate": 2.0580972557949616e-07, + "loss": 0.80106342, + "num_input_tokens_seen": 154533700, + "step": 7141, + "time_per_iteration": 2.682494878768921 + }, + { + "auxiliary_loss_clip": 0.01105116, + "auxiliary_loss_mlp": 0.01079004, + "balance_loss_clip": 1.01776552, + "balance_loss_mlp": 1.00003934, + "epoch": 0.8587747249443877, + "flos": 64811184422400.0, + "grad_norm": 0.793424583365271, + "language_loss": 0.54305339, + "learning_rate": 2.054656828459125e-07, + "loss": 0.56489462, + "num_input_tokens_seen": 154597810, + "step": 7142, + "time_per_iteration": 3.285355806350708 + }, + { + "auxiliary_loss_clip": 0.01098798, + "auxiliary_loss_mlp": 0.01084219, + "balance_loss_clip": 1.02413023, + "balance_loss_mlp": 1.0041101, + "epoch": 0.8588949678350267, + "flos": 26834607964800.0, + "grad_norm": 1.6030673611289665, + "language_loss": 0.77356422, + "learning_rate": 2.051219123417578e-07, + "loss": 0.79539436, + "num_input_tokens_seen": 154617870, + "step": 7143, + "time_per_iteration": 2.9393367767333984 + }, + { + "auxiliary_loss_clip": 0.01135499, + "auxiliary_loss_mlp": 0.01084083, + "balance_loss_clip": 1.02597201, + "balance_loss_mlp": 1.00392663, + "epoch": 0.8590152107256659, + "flos": 26104256726400.0, + "grad_norm": 1.9805482170145767, + "language_loss": 0.60422426, + "learning_rate": 2.0477841411918196e-07, + "loss": 0.62642008, + "num_input_tokens_seen": 154637395, + "step": 7144, + "time_per_iteration": 2.688779354095459 + }, + { + "auxiliary_loss_clip": 0.01122912, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_clip": 1.02385139, + "balance_loss_mlp": 1.00418234, + "epoch": 0.859135453616305, + "flos": 26140885620480.0, + "grad_norm": 1.765675710493123, + "language_loss": 0.74879551, + "learning_rate": 2.0443518823029326e-07, + "loss": 0.77086806, + "num_input_tokens_seen": 154657935, + "step": 7145, + "time_per_iteration": 2.6715657711029053 + }, + { + "auxiliary_loss_clip": 0.01105985, + "auxiliary_loss_mlp": 0.01083506, + "balance_loss_clip": 1.0230248, + "balance_loss_mlp": 1.00344491, + "epoch": 0.859255696506944, + "flos": 12969319046400.0, + "grad_norm": 2.050398934992989, + "language_loss": 0.76261735, + "learning_rate": 2.0409223472715854e-07, + "loss": 0.78451228, + "num_input_tokens_seen": 154675080, + "step": 7146, + "time_per_iteration": 2.754213571548462 + }, + { + "auxiliary_loss_clip": 0.01105047, + "auxiliary_loss_mlp": 0.0087281, + "balance_loss_clip": 1.02252698, + "balance_loss_mlp": 1.00013292, + "epoch": 0.8593759393975832, + "flos": 18475753063680.0, + "grad_norm": 1.8276993197805296, + "language_loss": 0.75060767, + "learning_rate": 2.0374955366180434e-07, + "loss": 0.77038622, + "num_input_tokens_seen": 154692720, + "step": 7147, + "time_per_iteration": 2.7918992042541504 + }, + { + "auxiliary_loss_clip": 0.01107026, + "auxiliary_loss_mlp": 0.01084674, + "balance_loss_clip": 1.02308202, + "balance_loss_mlp": 1.00451756, + "epoch": 0.8594961822882222, + "flos": 22200156512640.0, + "grad_norm": 1.8772850404966752, + "language_loss": 0.72397101, + "learning_rate": 2.034071450862147e-07, + "loss": 0.74588799, + "num_input_tokens_seen": 154710190, + "step": 7148, + "time_per_iteration": 2.7544162273406982 + }, + { + "auxiliary_loss_clip": 0.01119519, + "auxiliary_loss_mlp": 0.01083574, + "balance_loss_clip": 1.02583265, + "balance_loss_mlp": 1.00346506, + "epoch": 0.8596164251788613, + "flos": 23294749616640.0, + "grad_norm": 1.868666134092312, + "language_loss": 0.76716268, + "learning_rate": 2.030650090523327e-07, + "loss": 0.78919363, + "num_input_tokens_seen": 154729380, + "step": 7149, + "time_per_iteration": 2.74735164642334 + }, + { + "auxiliary_loss_clip": 0.01109884, + "auxiliary_loss_mlp": 0.01085167, + "balance_loss_clip": 1.02566683, + "balance_loss_mlp": 1.00496316, + "epoch": 0.8597366680695004, + "flos": 31649905416960.0, + "grad_norm": 1.8126762152053713, + "language_loss": 0.59606934, + "learning_rate": 2.0272314561205995e-07, + "loss": 0.61801982, + "num_input_tokens_seen": 154749775, + "step": 7150, + "time_per_iteration": 2.8604931831359863 + }, + { + "auxiliary_loss_clip": 0.01106824, + "auxiliary_loss_mlp": 0.01083699, + "balance_loss_clip": 1.02335954, + "balance_loss_mlp": 1.00359082, + "epoch": 0.8598569109601395, + "flos": 21287738211840.0, + "grad_norm": 1.7304381736657632, + "language_loss": 0.72637796, + "learning_rate": 2.023815548172567e-07, + "loss": 0.74828321, + "num_input_tokens_seen": 154769845, + "step": 7151, + "time_per_iteration": 2.8544273376464844 + }, + { + "auxiliary_loss_clip": 0.01127355, + "auxiliary_loss_mlp": 0.01084508, + "balance_loss_clip": 1.02611208, + "balance_loss_mlp": 1.00439942, + "epoch": 0.8599771538507786, + "flos": 25447809720960.0, + "grad_norm": 1.5888036458748804, + "language_loss": 0.66056776, + "learning_rate": 2.0204023671974267e-07, + "loss": 0.68268645, + "num_input_tokens_seen": 154789230, + "step": 7152, + "time_per_iteration": 2.7145493030548096 + }, + { + "auxiliary_loss_clip": 0.01116431, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_clip": 1.02229333, + "balance_loss_mlp": 1.00371838, + "epoch": 0.8600973967414177, + "flos": 16723958768640.0, + "grad_norm": 3.7971549551529837, + "language_loss": 0.8076148, + "learning_rate": 2.0169919137129532e-07, + "loss": 0.8296169, + "num_input_tokens_seen": 154807670, + "step": 7153, + "time_per_iteration": 2.6393749713897705 + }, + { + "auxiliary_loss_clip": 0.01125018, + "auxiliary_loss_mlp": 0.01084901, + "balance_loss_clip": 1.02538168, + "balance_loss_mlp": 1.00464964, + "epoch": 0.8602176396320568, + "flos": 25227928615680.0, + "grad_norm": 2.192475044027222, + "language_loss": 0.70495337, + "learning_rate": 2.013584188236508e-07, + "loss": 0.72705257, + "num_input_tokens_seen": 154825575, + "step": 7154, + "time_per_iteration": 2.6837306022644043 + }, + { + "auxiliary_loss_clip": 0.01134017, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_clip": 1.02427602, + "balance_loss_mlp": 1.00398707, + "epoch": 0.8603378825226958, + "flos": 20412236113920.0, + "grad_norm": 1.7616966473769122, + "language_loss": 0.79414856, + "learning_rate": 2.0101791912850396e-07, + "loss": 0.81633019, + "num_input_tokens_seen": 154845115, + "step": 7155, + "time_per_iteration": 2.6278114318847656 + }, + { + "auxiliary_loss_clip": 0.01115861, + "auxiliary_loss_mlp": 0.01083484, + "balance_loss_clip": 1.0247196, + "balance_loss_mlp": 1.00337505, + "epoch": 0.8604581254133349, + "flos": 34930201109760.0, + "grad_norm": 2.2057504030068635, + "language_loss": 0.64360744, + "learning_rate": 2.006776923375082e-07, + "loss": 0.6656009, + "num_input_tokens_seen": 154866770, + "step": 7156, + "time_per_iteration": 2.814239740371704 + }, + { + "auxiliary_loss_clip": 0.01135151, + "auxiliary_loss_mlp": 0.0108396, + "balance_loss_clip": 1.02574742, + "balance_loss_mlp": 1.00389934, + "epoch": 0.860578368303974, + "flos": 22596538072320.0, + "grad_norm": 1.592407880702629, + "language_loss": 0.71204442, + "learning_rate": 2.003377385022764e-07, + "loss": 0.73423553, + "num_input_tokens_seen": 154885595, + "step": 7157, + "time_per_iteration": 2.628483295440674 + }, + { + "auxiliary_loss_clip": 0.01115416, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_clip": 1.02376091, + "balance_loss_mlp": 1.00439525, + "epoch": 0.8606986111946131, + "flos": 21324331192320.0, + "grad_norm": 1.991201440663348, + "language_loss": 0.77380157, + "learning_rate": 1.9999805767437826e-07, + "loss": 0.79580033, + "num_input_tokens_seen": 154904485, + "step": 7158, + "time_per_iteration": 4.4983274936676025 + }, + { + "auxiliary_loss_clip": 0.01115256, + "auxiliary_loss_mlp": 0.01083296, + "balance_loss_clip": 1.023278, + "balance_loss_mlp": 1.00323486, + "epoch": 0.8608188540852522, + "flos": 28877206769280.0, + "grad_norm": 1.7543818939091116, + "language_loss": 0.71785748, + "learning_rate": 1.9965864990534386e-07, + "loss": 0.73984301, + "num_input_tokens_seen": 154925010, + "step": 7159, + "time_per_iteration": 2.7357635498046875 + }, + { + "auxiliary_loss_clip": 0.01107027, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_clip": 1.02295554, + "balance_loss_mlp": 1.00446248, + "epoch": 0.8609390969758913, + "flos": 29716187713920.0, + "grad_norm": 1.631169709010417, + "language_loss": 0.77675086, + "learning_rate": 1.9931951524666092e-07, + "loss": 0.79866731, + "num_input_tokens_seen": 154946100, + "step": 7160, + "time_per_iteration": 2.8373212814331055 + }, + { + "auxiliary_loss_clip": 0.01126115, + "auxiliary_loss_mlp": 0.00872774, + "balance_loss_clip": 1.02526951, + "balance_loss_mlp": 1.00007844, + "epoch": 0.8610593398665304, + "flos": 21249349551360.0, + "grad_norm": 1.5546081452035256, + "language_loss": 0.80912179, + "learning_rate": 1.9898065374977534e-07, + "loss": 0.82911074, + "num_input_tokens_seen": 154966305, + "step": 7161, + "time_per_iteration": 3.6477303504943848 + }, + { + "auxiliary_loss_clip": 0.01083821, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.02316058, + "balance_loss_mlp": 1.00367856, + "epoch": 0.8611795827571694, + "flos": 14830102183680.0, + "grad_norm": 2.2714067692734226, + "language_loss": 0.72996819, + "learning_rate": 1.9864206546609342e-07, + "loss": 0.75164247, + "num_input_tokens_seen": 154985145, + "step": 7162, + "time_per_iteration": 2.707998752593994 + }, + { + "auxiliary_loss_clip": 0.01133604, + "auxiliary_loss_mlp": 0.01084353, + "balance_loss_clip": 1.02438974, + "balance_loss_mlp": 1.00429237, + "epoch": 0.8612998256478086, + "flos": 24243258107520.0, + "grad_norm": 1.885352126062552, + "language_loss": 0.83996934, + "learning_rate": 1.983037504469771e-07, + "loss": 0.86214888, + "num_input_tokens_seen": 155003855, + "step": 7163, + "time_per_iteration": 3.612274169921875 + }, + { + "auxiliary_loss_clip": 0.01125863, + "auxiliary_loss_mlp": 0.01084151, + "balance_loss_clip": 1.02513027, + "balance_loss_mlp": 1.00409031, + "epoch": 0.8614200685384477, + "flos": 21252653602560.0, + "grad_norm": 1.6749385976648075, + "language_loss": 0.66660923, + "learning_rate": 1.9796570874374984e-07, + "loss": 0.68870944, + "num_input_tokens_seen": 155023960, + "step": 7164, + "time_per_iteration": 2.701155424118042 + }, + { + "auxiliary_loss_clip": 0.01115687, + "auxiliary_loss_mlp": 0.01083373, + "balance_loss_clip": 1.02369833, + "balance_loss_mlp": 1.00326395, + "epoch": 0.8615403114290867, + "flos": 20007738080640.0, + "grad_norm": 1.5892380594015045, + "language_loss": 0.7744441, + "learning_rate": 1.976279404076917e-07, + "loss": 0.7964347, + "num_input_tokens_seen": 155043360, + "step": 7165, + "time_per_iteration": 2.718656301498413 + }, + { + "auxiliary_loss_clip": 0.01105813, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_clip": 1.0240612, + "balance_loss_mlp": 1.00377655, + "epoch": 0.8616605543197259, + "flos": 29789373674880.0, + "grad_norm": 1.8443132802776174, + "language_loss": 0.75955606, + "learning_rate": 1.9729044549004193e-07, + "loss": 0.78145403, + "num_input_tokens_seen": 155064745, + "step": 7166, + "time_per_iteration": 2.7979466915130615 + }, + { + "auxiliary_loss_clip": 0.0112437, + "auxiliary_loss_mlp": 0.01084652, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.00454378, + "epoch": 0.8617807972103649, + "flos": 28911609020160.0, + "grad_norm": 1.7409553971893614, + "language_loss": 0.70520115, + "learning_rate": 1.9695322404199822e-07, + "loss": 0.72729135, + "num_input_tokens_seen": 155086790, + "step": 7167, + "time_per_iteration": 2.7009963989257812 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_clip": 1.02055073, + "balance_loss_mlp": 1.00401223, + "epoch": 0.861901040101004, + "flos": 27673804391040.0, + "grad_norm": 2.1298751810932472, + "language_loss": 0.82323718, + "learning_rate": 1.9661627611471654e-07, + "loss": 0.84518075, + "num_input_tokens_seen": 155106585, + "step": 7168, + "time_per_iteration": 2.7975244522094727 + }, + { + "auxiliary_loss_clip": 0.01099383, + "auxiliary_loss_mlp": 0.01083892, + "balance_loss_clip": 1.02362514, + "balance_loss_mlp": 1.00373602, + "epoch": 0.8620212829916432, + "flos": 49748056755840.0, + "grad_norm": 1.9887662472931107, + "language_loss": 0.69937658, + "learning_rate": 1.9627960175931246e-07, + "loss": 0.72120935, + "num_input_tokens_seen": 155131285, + "step": 7169, + "time_per_iteration": 2.896158456802368 + }, + { + "auxiliary_loss_clip": 0.0112572, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_clip": 1.02562106, + "balance_loss_mlp": 1.00390983, + "epoch": 0.8621415258822822, + "flos": 21138672769920.0, + "grad_norm": 2.1262659470494714, + "language_loss": 0.74067652, + "learning_rate": 1.9594320102685847e-07, + "loss": 0.76277399, + "num_input_tokens_seen": 155150555, + "step": 7170, + "time_per_iteration": 2.66670560836792 + }, + { + "auxiliary_loss_clip": 0.01115109, + "auxiliary_loss_mlp": 0.0087291, + "balance_loss_clip": 1.02388477, + "balance_loss_mlp": 1.00006986, + "epoch": 0.8622617687729213, + "flos": 21689039934720.0, + "grad_norm": 2.023637397210446, + "language_loss": 0.64336652, + "learning_rate": 1.956070739683864e-07, + "loss": 0.66324675, + "num_input_tokens_seen": 155169890, + "step": 7171, + "time_per_iteration": 2.671109914779663 + }, + { + "auxiliary_loss_clip": 0.0110587, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_clip": 1.02244473, + "balance_loss_mlp": 1.00407171, + "epoch": 0.8623820116635604, + "flos": 26250592734720.0, + "grad_norm": 1.432888133395662, + "language_loss": 0.74211609, + "learning_rate": 1.9527122063488678e-07, + "loss": 0.76401615, + "num_input_tokens_seen": 155191005, + "step": 7172, + "time_per_iteration": 2.805375099182129 + }, + { + "auxiliary_loss_clip": 0.01117734, + "auxiliary_loss_mlp": 0.01083289, + "balance_loss_clip": 1.02464628, + "balance_loss_mlp": 1.00322819, + "epoch": 0.8625022545541995, + "flos": 19647554451840.0, + "grad_norm": 1.5945882926275865, + "language_loss": 0.80243862, + "learning_rate": 1.9493564107730755e-07, + "loss": 0.82444882, + "num_input_tokens_seen": 155211005, + "step": 7173, + "time_per_iteration": 2.7702906131744385 + }, + { + "auxiliary_loss_clip": 0.01117819, + "auxiliary_loss_mlp": 0.01083638, + "balance_loss_clip": 1.02533627, + "balance_loss_mlp": 1.00362432, + "epoch": 0.8626224974448385, + "flos": 21908382336000.0, + "grad_norm": 2.067273409397042, + "language_loss": 0.60751969, + "learning_rate": 1.9460033534655684e-07, + "loss": 0.62953424, + "num_input_tokens_seen": 155230365, + "step": 7174, + "time_per_iteration": 2.720867395401001 + }, + { + "auxiliary_loss_clip": 0.01116609, + "auxiliary_loss_mlp": 0.01083375, + "balance_loss_clip": 1.02372038, + "balance_loss_mlp": 1.00326633, + "epoch": 0.8627427403354777, + "flos": 23331198942720.0, + "grad_norm": 1.5547039206690283, + "language_loss": 0.84067106, + "learning_rate": 1.9426530349349978e-07, + "loss": 0.8626709, + "num_input_tokens_seen": 155250815, + "step": 7175, + "time_per_iteration": 2.7900238037109375 + }, + { + "auxiliary_loss_clip": 0.01127082, + "auxiliary_loss_mlp": 0.00872802, + "balance_loss_clip": 1.0266093, + "balance_loss_mlp": 1.00009191, + "epoch": 0.8628629832261168, + "flos": 16362877299840.0, + "grad_norm": 1.6940947553979766, + "language_loss": 0.65016711, + "learning_rate": 1.9393054556896038e-07, + "loss": 0.6701659, + "num_input_tokens_seen": 155268515, + "step": 7176, + "time_per_iteration": 2.7020528316497803 + }, + { + "auxiliary_loss_clip": 0.01090132, + "auxiliary_loss_mlp": 0.01083701, + "balance_loss_clip": 1.02320838, + "balance_loss_mlp": 1.0036397, + "epoch": 0.8629832261167558, + "flos": 28103941756800.0, + "grad_norm": 2.6828058887248942, + "language_loss": 0.69081092, + "learning_rate": 1.9359606162372133e-07, + "loss": 0.71254933, + "num_input_tokens_seen": 155290120, + "step": 7177, + "time_per_iteration": 2.866913318634033 + }, + { + "auxiliary_loss_clip": 0.0113587, + "auxiliary_loss_mlp": 0.01083098, + "balance_loss_clip": 1.02685571, + "balance_loss_mlp": 1.00298941, + "epoch": 0.863103469007395, + "flos": 20230061310720.0, + "grad_norm": 1.7201719100357586, + "language_loss": 0.70522612, + "learning_rate": 1.9326185170852293e-07, + "loss": 0.7274158, + "num_input_tokens_seen": 155309085, + "step": 7178, + "time_per_iteration": 2.6366240978240967 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_clip": 1.02589488, + "balance_loss_mlp": 1.0041225, + "epoch": 0.863223711898034, + "flos": 24498547044480.0, + "grad_norm": 1.8299226565371298, + "language_loss": 0.72092366, + "learning_rate": 1.9292791587406598e-07, + "loss": 0.74303758, + "num_input_tokens_seen": 155327945, + "step": 7179, + "time_per_iteration": 2.6835310459136963 + }, + { + "auxiliary_loss_clip": 0.0112678, + "auxiliary_loss_mlp": 0.00872875, + "balance_loss_clip": 1.02534902, + "balance_loss_mlp": 1.00008941, + "epoch": 0.8633439547886731, + "flos": 17675376261120.0, + "grad_norm": 2.111192657586083, + "language_loss": 0.86798024, + "learning_rate": 1.9259425417100661e-07, + "loss": 0.88797677, + "num_input_tokens_seen": 155344060, + "step": 7180, + "time_per_iteration": 2.6544840335845947 + }, + { + "auxiliary_loss_clip": 0.0109241, + "auxiliary_loss_mlp": 0.01084514, + "balance_loss_clip": 1.02345586, + "balance_loss_mlp": 1.00435805, + "epoch": 0.8634641976793123, + "flos": 12895055677440.0, + "grad_norm": 2.004255950536035, + "language_loss": 0.74860239, + "learning_rate": 1.9226086664996234e-07, + "loss": 0.77037156, + "num_input_tokens_seen": 155362305, + "step": 7181, + "time_per_iteration": 2.827472686767578 + }, + { + "auxiliary_loss_clip": 0.01099398, + "auxiliary_loss_mlp": 0.01084955, + "balance_loss_clip": 1.02501106, + "balance_loss_mlp": 1.0048461, + "epoch": 0.8635844405699513, + "flos": 23878980328320.0, + "grad_norm": 2.1046604409623173, + "language_loss": 0.74243319, + "learning_rate": 1.9192775336150712e-07, + "loss": 0.76427668, + "num_input_tokens_seen": 155382605, + "step": 7182, + "time_per_iteration": 2.6729624271392822 + }, + { + "auxiliary_loss_clip": 0.01104136, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_clip": 1.01742172, + "balance_loss_mlp": 1.00031805, + "epoch": 0.8637046834605904, + "flos": 60453387521280.0, + "grad_norm": 0.7918261486630978, + "language_loss": 0.56316912, + "learning_rate": 1.915949143561739e-07, + "loss": 0.58500338, + "num_input_tokens_seen": 155437280, + "step": 7183, + "time_per_iteration": 5.0474748611450195 + }, + { + "auxiliary_loss_clip": 0.01125969, + "auxiliary_loss_mlp": 0.01083909, + "balance_loss_clip": 1.02659106, + "balance_loss_mlp": 1.00375247, + "epoch": 0.8638249263512295, + "flos": 20558751690240.0, + "grad_norm": 1.6077288614984402, + "language_loss": 0.78089905, + "learning_rate": 1.9126234968445498e-07, + "loss": 0.80299783, + "num_input_tokens_seen": 155456970, + "step": 7184, + "time_per_iteration": 2.672529458999634 + }, + { + "auxiliary_loss_clip": 0.01134801, + "auxiliary_loss_mlp": 0.01083517, + "balance_loss_clip": 1.02542579, + "balance_loss_mlp": 1.00340819, + "epoch": 0.8639451692418686, + "flos": 26615768353920.0, + "grad_norm": 1.3236063585638664, + "language_loss": 0.67560011, + "learning_rate": 1.9093005939679884e-07, + "loss": 0.69778335, + "num_input_tokens_seen": 155478925, + "step": 7185, + "time_per_iteration": 2.758085250854492 + }, + { + "auxiliary_loss_clip": 0.01124801, + "auxiliary_loss_mlp": 0.01083427, + "balance_loss_clip": 1.02494168, + "balance_loss_mlp": 1.00341439, + "epoch": 0.8640654121325076, + "flos": 15122450977920.0, + "grad_norm": 1.773338613395958, + "language_loss": 0.76528549, + "learning_rate": 1.9059804354361452e-07, + "loss": 0.78736782, + "num_input_tokens_seen": 155496700, + "step": 7186, + "time_per_iteration": 3.4916577339172363 + }, + { + "auxiliary_loss_clip": 0.0111802, + "auxiliary_loss_mlp": 0.0108412, + "balance_loss_clip": 1.0244422, + "balance_loss_mlp": 1.00396335, + "epoch": 0.8641856550231467, + "flos": 31869068250240.0, + "grad_norm": 1.5651835739959257, + "language_loss": 0.70487517, + "learning_rate": 1.902663021752684e-07, + "loss": 0.72689664, + "num_input_tokens_seen": 155518130, + "step": 7187, + "time_per_iteration": 2.837376594543457 + }, + { + "auxiliary_loss_clip": 0.01135518, + "auxiliary_loss_mlp": 0.01084351, + "balance_loss_clip": 1.0260632, + "balance_loss_mlp": 1.00419414, + "epoch": 0.8643058979137859, + "flos": 14976545932800.0, + "grad_norm": 2.0252018126493354, + "language_loss": 0.82287526, + "learning_rate": 1.8993483534208556e-07, + "loss": 0.84507388, + "num_input_tokens_seen": 155537040, + "step": 7188, + "time_per_iteration": 2.585866928100586 + }, + { + "auxiliary_loss_clip": 0.01111605, + "auxiliary_loss_mlp": 0.01085083, + "balance_loss_clip": 1.0208962, + "balance_loss_mlp": 1.0049262, + "epoch": 0.8644261408044249, + "flos": 13115726881920.0, + "grad_norm": 2.3915165550030304, + "language_loss": 0.75167799, + "learning_rate": 1.8960364309434884e-07, + "loss": 0.77364492, + "num_input_tokens_seen": 155554535, + "step": 7189, + "time_per_iteration": 3.655003547668457 + }, + { + "auxiliary_loss_clip": 0.01087035, + "auxiliary_loss_mlp": 0.00872774, + "balance_loss_clip": 1.02168953, + "balance_loss_mlp": 1.00010991, + "epoch": 0.864546383695064, + "flos": 20850920916480.0, + "grad_norm": 1.6981921452214424, + "language_loss": 0.78304207, + "learning_rate": 1.8927272548229967e-07, + "loss": 0.8026402, + "num_input_tokens_seen": 155574225, + "step": 7190, + "time_per_iteration": 2.87777042388916 + }, + { + "auxiliary_loss_clip": 0.01098454, + "auxiliary_loss_mlp": 0.01083755, + "balance_loss_clip": 1.0235858, + "balance_loss_mlp": 1.00369418, + "epoch": 0.8646666265857031, + "flos": 21324582587520.0, + "grad_norm": 1.6439169704056602, + "language_loss": 0.83170855, + "learning_rate": 1.8894208255613876e-07, + "loss": 0.8535307, + "num_input_tokens_seen": 155593540, + "step": 7191, + "time_per_iteration": 2.7815589904785156 + }, + { + "auxiliary_loss_clip": 0.011347, + "auxiliary_loss_mlp": 0.01083554, + "balance_loss_clip": 1.02535033, + "balance_loss_mlp": 1.00363612, + "epoch": 0.8647868694763422, + "flos": 19750833031680.0, + "grad_norm": 2.0635641577507298, + "language_loss": 0.77625203, + "learning_rate": 1.8861171436602397e-07, + "loss": 0.79843456, + "num_input_tokens_seen": 155610655, + "step": 7192, + "time_per_iteration": 2.6168150901794434 + }, + { + "auxiliary_loss_clip": 0.01126007, + "auxiliary_loss_mlp": 0.01084348, + "balance_loss_clip": 1.02599943, + "balance_loss_mlp": 1.00428677, + "epoch": 0.8649071123669813, + "flos": 26176760328960.0, + "grad_norm": 2.263282154681203, + "language_loss": 0.79983306, + "learning_rate": 1.882816209620719e-07, + "loss": 0.82193661, + "num_input_tokens_seen": 155627365, + "step": 7193, + "time_per_iteration": 2.6648519039154053 + }, + { + "auxiliary_loss_clip": 0.01112896, + "auxiliary_loss_mlp": 0.01085683, + "balance_loss_clip": 1.02245378, + "balance_loss_mlp": 1.00552678, + "epoch": 0.8650273552576204, + "flos": 20302888135680.0, + "grad_norm": 1.8467544369913593, + "language_loss": 0.76846504, + "learning_rate": 1.8795180239435738e-07, + "loss": 0.79045081, + "num_input_tokens_seen": 155646220, + "step": 7194, + "time_per_iteration": 2.711411952972412 + }, + { + "auxiliary_loss_clip": 0.01117289, + "auxiliary_loss_mlp": 0.01085278, + "balance_loss_clip": 1.02450097, + "balance_loss_mlp": 1.00516891, + "epoch": 0.8651475981482595, + "flos": 23951088881280.0, + "grad_norm": 2.600444093631547, + "language_loss": 0.76075256, + "learning_rate": 1.8762225871291348e-07, + "loss": 0.78277826, + "num_input_tokens_seen": 155662095, + "step": 7195, + "time_per_iteration": 2.727273464202881 + }, + { + "auxiliary_loss_clip": 0.01134695, + "auxiliary_loss_mlp": 0.00872914, + "balance_loss_clip": 1.02561998, + "balance_loss_mlp": 1.0000906, + "epoch": 0.8652678410388985, + "flos": 21684622561920.0, + "grad_norm": 1.617337917143809, + "language_loss": 0.80992329, + "learning_rate": 1.8729298996773201e-07, + "loss": 0.82999939, + "num_input_tokens_seen": 155680845, + "step": 7196, + "time_per_iteration": 2.6705057621002197 + }, + { + "auxiliary_loss_clip": 0.01106911, + "auxiliary_loss_mlp": 0.01079038, + "balance_loss_clip": 1.02041948, + "balance_loss_mlp": 1.00007331, + "epoch": 0.8653880839295377, + "flos": 65224660855680.0, + "grad_norm": 0.832930624818606, + "language_loss": 0.61008281, + "learning_rate": 1.8696399620876301e-07, + "loss": 0.63194227, + "num_input_tokens_seen": 155737875, + "step": 7197, + "time_per_iteration": 3.2227797508239746 + }, + { + "auxiliary_loss_clip": 0.01110819, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_clip": 1.0251677, + "balance_loss_mlp": 1.00417113, + "epoch": 0.8655083268201768, + "flos": 17749172753280.0, + "grad_norm": 1.884137269550784, + "language_loss": 0.79035515, + "learning_rate": 1.866352774859141e-07, + "loss": 0.81230617, + "num_input_tokens_seen": 155753100, + "step": 7198, + "time_per_iteration": 2.7112951278686523 + }, + { + "auxiliary_loss_clip": 0.01108088, + "auxiliary_loss_mlp": 0.0108361, + "balance_loss_clip": 1.02366996, + "balance_loss_mlp": 1.00354934, + "epoch": 0.8656285697108158, + "flos": 20703974376960.0, + "grad_norm": 2.0964716944653734, + "language_loss": 0.69207132, + "learning_rate": 1.8630683384905188e-07, + "loss": 0.7139883, + "num_input_tokens_seen": 155772430, + "step": 7199, + "time_per_iteration": 2.8032915592193604 + }, + { + "auxiliary_loss_clip": 0.01134054, + "auxiliary_loss_mlp": 0.00873003, + "balance_loss_clip": 1.02474546, + "balance_loss_mlp": 1.00004292, + "epoch": 0.865748812601455, + "flos": 18653833716480.0, + "grad_norm": 1.7247754755974265, + "language_loss": 0.88464952, + "learning_rate": 1.8597866534800045e-07, + "loss": 0.90472007, + "num_input_tokens_seen": 155787545, + "step": 7200, + "time_per_iteration": 2.58984375 + }, + { + "auxiliary_loss_clip": 0.01126397, + "auxiliary_loss_mlp": 0.00872886, + "balance_loss_clip": 1.02563703, + "balance_loss_mlp": 1.00004375, + "epoch": 0.865869055492094, + "flos": 70652554807680.0, + "grad_norm": 1.683361347088334, + "language_loss": 0.7418502, + "learning_rate": 1.8565077203254398e-07, + "loss": 0.76184303, + "num_input_tokens_seen": 155813005, + "step": 7201, + "time_per_iteration": 3.084088087081909 + }, + { + "auxiliary_loss_clip": 0.01106297, + "auxiliary_loss_mlp": 0.01083627, + "balance_loss_clip": 1.02476108, + "balance_loss_mlp": 1.00347066, + "epoch": 0.8659892983827331, + "flos": 17383961220480.0, + "grad_norm": 2.3092098960001013, + "language_loss": 0.72681755, + "learning_rate": 1.8532315395242203e-07, + "loss": 0.74871683, + "num_input_tokens_seen": 155829455, + "step": 7202, + "time_per_iteration": 2.6927802562713623 + }, + { + "auxiliary_loss_clip": 0.01106198, + "auxiliary_loss_mlp": 0.0108401, + "balance_loss_clip": 1.02278078, + "balance_loss_mlp": 1.00399637, + "epoch": 0.8661095412733723, + "flos": 17895221452800.0, + "grad_norm": 2.1109101565448594, + "language_loss": 0.72017962, + "learning_rate": 1.849958111573353e-07, + "loss": 0.74208164, + "num_input_tokens_seen": 155848060, + "step": 7203, + "time_per_iteration": 2.778364896774292 + }, + { + "auxiliary_loss_clip": 0.01134984, + "auxiliary_loss_mlp": 0.01083481, + "balance_loss_clip": 1.02606654, + "balance_loss_mlp": 1.00332499, + "epoch": 0.8662297841640113, + "flos": 18224163227520.0, + "grad_norm": 3.334388444228375, + "language_loss": 0.64037359, + "learning_rate": 1.8466874369694074e-07, + "loss": 0.66255826, + "num_input_tokens_seen": 155865755, + "step": 7204, + "time_per_iteration": 2.6022748947143555 + }, + { + "auxiliary_loss_clip": 0.01109661, + "auxiliary_loss_mlp": 0.01085128, + "balance_loss_clip": 1.02452159, + "balance_loss_mlp": 1.00501883, + "epoch": 0.8663500270546504, + "flos": 16362159027840.0, + "grad_norm": 2.792533283641629, + "language_loss": 0.70490921, + "learning_rate": 1.843419516208542e-07, + "loss": 0.72685713, + "num_input_tokens_seen": 155882680, + "step": 7205, + "time_per_iteration": 2.776017904281616 + }, + { + "auxiliary_loss_clip": 0.01127564, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_clip": 1.02686882, + "balance_loss_mlp": 1.0042367, + "epoch": 0.8664702699452895, + "flos": 17894431353600.0, + "grad_norm": 2.718554899702216, + "language_loss": 0.79549301, + "learning_rate": 1.8401543497865047e-07, + "loss": 0.81761217, + "num_input_tokens_seen": 155900680, + "step": 7206, + "time_per_iteration": 2.6864867210388184 + }, + { + "auxiliary_loss_clip": 0.01124983, + "auxiliary_loss_mlp": 0.00872817, + "balance_loss_clip": 1.02396405, + "balance_loss_mlp": 1.00010288, + "epoch": 0.8665905128359286, + "flos": 30736373794560.0, + "grad_norm": 1.8816713923393735, + "language_loss": 0.64608246, + "learning_rate": 1.836891938198608e-07, + "loss": 0.66606045, + "num_input_tokens_seen": 155921105, + "step": 7207, + "time_per_iteration": 2.7700514793395996 + }, + { + "auxiliary_loss_clip": 0.01116645, + "auxiliary_loss_mlp": 0.01084343, + "balance_loss_clip": 1.02512872, + "balance_loss_mlp": 1.00432968, + "epoch": 0.8667107557265676, + "flos": 18656419495680.0, + "grad_norm": 2.077413778414286, + "language_loss": 0.71118432, + "learning_rate": 1.8336322819397677e-07, + "loss": 0.73319417, + "num_input_tokens_seen": 155938640, + "step": 7208, + "time_per_iteration": 2.7354538440704346 + }, + { + "auxiliary_loss_clip": 0.01108482, + "auxiliary_loss_mlp": 0.01083932, + "balance_loss_clip": 1.02314997, + "balance_loss_mlp": 1.00372839, + "epoch": 0.8668309986172068, + "flos": 20083725302400.0, + "grad_norm": 1.8180849662564986, + "language_loss": 0.62840474, + "learning_rate": 1.8303753815044654e-07, + "loss": 0.65032893, + "num_input_tokens_seen": 155957945, + "step": 7209, + "time_per_iteration": 4.6372151374816895 + }, + { + "auxiliary_loss_clip": 0.01117581, + "auxiliary_loss_mlp": 0.01085074, + "balance_loss_clip": 1.0239979, + "balance_loss_mlp": 1.0046792, + "epoch": 0.8669512415078459, + "flos": 21615099788160.0, + "grad_norm": 2.1598051168074965, + "language_loss": 0.70460343, + "learning_rate": 1.827121237386773e-07, + "loss": 0.72663003, + "num_input_tokens_seen": 155975390, + "step": 7210, + "time_per_iteration": 2.7138397693634033 + }, + { + "auxiliary_loss_clip": 0.01114079, + "auxiliary_loss_mlp": 0.01085217, + "balance_loss_clip": 1.02235532, + "balance_loss_mlp": 1.00501323, + "epoch": 0.8670714843984849, + "flos": 17703601372800.0, + "grad_norm": 2.1683411382462014, + "language_loss": 0.7508868, + "learning_rate": 1.8238698500803374e-07, + "loss": 0.77287972, + "num_input_tokens_seen": 155988155, + "step": 7211, + "time_per_iteration": 3.5298473834991455 + }, + { + "auxiliary_loss_clip": 0.01104885, + "auxiliary_loss_mlp": 0.01079151, + "balance_loss_clip": 1.01747155, + "balance_loss_mlp": 1.00018716, + "epoch": 0.8671917272891241, + "flos": 60705483125760.0, + "grad_norm": 0.7183024860139843, + "language_loss": 0.56331813, + "learning_rate": 1.820621220078391e-07, + "loss": 0.58515859, + "num_input_tokens_seen": 156052065, + "step": 7212, + "time_per_iteration": 3.2946643829345703 + }, + { + "auxiliary_loss_clip": 0.01135231, + "auxiliary_loss_mlp": 0.0108401, + "balance_loss_clip": 1.02604008, + "balance_loss_mlp": 1.00390136, + "epoch": 0.8673119701797631, + "flos": 20451881750400.0, + "grad_norm": 1.5590866614613275, + "language_loss": 0.67950606, + "learning_rate": 1.8173753478737553e-07, + "loss": 0.70169848, + "num_input_tokens_seen": 156072500, + "step": 7213, + "time_per_iteration": 2.663299322128296 + }, + { + "auxiliary_loss_clip": 0.01135479, + "auxiliary_loss_mlp": 0.01084495, + "balance_loss_clip": 1.02614307, + "balance_loss_mlp": 1.00438619, + "epoch": 0.8674322130704022, + "flos": 19647410797440.0, + "grad_norm": 1.9840835178708962, + "language_loss": 0.79538953, + "learning_rate": 1.8141322339588205e-07, + "loss": 0.81758928, + "num_input_tokens_seen": 156089840, + "step": 7214, + "time_per_iteration": 3.495509624481201 + }, + { + "auxiliary_loss_clip": 0.0113541, + "auxiliary_loss_mlp": 0.01083874, + "balance_loss_clip": 1.02623677, + "balance_loss_mlp": 1.00390851, + "epoch": 0.8675524559610414, + "flos": 26025001367040.0, + "grad_norm": 1.711667640439726, + "language_loss": 0.70066005, + "learning_rate": 1.810891878825569e-07, + "loss": 0.72285295, + "num_input_tokens_seen": 156109815, + "step": 7215, + "time_per_iteration": 2.622082471847534 + }, + { + "auxiliary_loss_clip": 0.01114764, + "auxiliary_loss_mlp": 0.01084592, + "balance_loss_clip": 1.02301872, + "balance_loss_mlp": 1.00438833, + "epoch": 0.8676726988516804, + "flos": 15049444584960.0, + "grad_norm": 1.8187082263243068, + "language_loss": 0.71734643, + "learning_rate": 1.8076542829655561e-07, + "loss": 0.73934001, + "num_input_tokens_seen": 156128620, + "step": 7216, + "time_per_iteration": 2.656052827835083 + }, + { + "auxiliary_loss_clip": 0.01110442, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_clip": 1.02107334, + "balance_loss_mlp": 1.00403094, + "epoch": 0.8677929417423195, + "flos": 16288111140480.0, + "grad_norm": 2.609362809957646, + "language_loss": 0.7938509, + "learning_rate": 1.8044194468699203e-07, + "loss": 0.81579816, + "num_input_tokens_seen": 156145930, + "step": 7217, + "time_per_iteration": 2.6320929527282715 + }, + { + "auxiliary_loss_clip": 0.01116663, + "auxiliary_loss_mlp": 0.01083383, + "balance_loss_clip": 1.02578056, + "balance_loss_mlp": 1.003227, + "epoch": 0.8679131846329585, + "flos": 18844160906880.0, + "grad_norm": 2.2978137420660385, + "language_loss": 0.75822628, + "learning_rate": 1.8011873710293912e-07, + "loss": 0.78022671, + "num_input_tokens_seen": 156164435, + "step": 7218, + "time_per_iteration": 2.7563629150390625 + }, + { + "auxiliary_loss_clip": 0.0111949, + "auxiliary_loss_mlp": 0.01084853, + "balance_loss_clip": 1.02507854, + "balance_loss_mlp": 1.00469661, + "epoch": 0.8680334275235977, + "flos": 33620718890880.0, + "grad_norm": 2.2170300307556507, + "language_loss": 0.69445896, + "learning_rate": 1.7979580559342677e-07, + "loss": 0.71650243, + "num_input_tokens_seen": 156185165, + "step": 7219, + "time_per_iteration": 2.739778757095337 + }, + { + "auxiliary_loss_clip": 0.01118004, + "auxiliary_loss_mlp": 0.01085996, + "balance_loss_clip": 1.02573395, + "balance_loss_mlp": 1.00584006, + "epoch": 0.8681536704142367, + "flos": 24681152810880.0, + "grad_norm": 1.6073704283083132, + "language_loss": 0.66874152, + "learning_rate": 1.7947315020744358e-07, + "loss": 0.69078153, + "num_input_tokens_seen": 156206260, + "step": 7220, + "time_per_iteration": 2.740678071975708 + }, + { + "auxiliary_loss_clip": 0.01115234, + "auxiliary_loss_mlp": 0.01082944, + "balance_loss_clip": 1.02371323, + "balance_loss_mlp": 1.00283599, + "epoch": 0.8682739133048758, + "flos": 20011042131840.0, + "grad_norm": 1.8233630056132404, + "language_loss": 0.80215472, + "learning_rate": 1.7915077099393594e-07, + "loss": 0.8241365, + "num_input_tokens_seen": 156222860, + "step": 7221, + "time_per_iteration": 2.711913824081421 + }, + { + "auxiliary_loss_clip": 0.01127257, + "auxiliary_loss_mlp": 0.01084638, + "balance_loss_clip": 1.02612758, + "balance_loss_mlp": 1.00448143, + "epoch": 0.868394156195515, + "flos": 16654759217280.0, + "grad_norm": 2.727006769187476, + "language_loss": 0.73243922, + "learning_rate": 1.788286680018083e-07, + "loss": 0.75455815, + "num_input_tokens_seen": 156241570, + "step": 7222, + "time_per_iteration": 2.684468984603882 + }, + { + "auxiliary_loss_clip": 0.01117115, + "auxiliary_loss_mlp": 0.0108495, + "balance_loss_clip": 1.02482224, + "balance_loss_mlp": 1.00484133, + "epoch": 0.868514399086154, + "flos": 28001381448960.0, + "grad_norm": 1.5846361734174796, + "language_loss": 0.72082144, + "learning_rate": 1.7850684127992443e-07, + "loss": 0.74284214, + "num_input_tokens_seen": 156261315, + "step": 7223, + "time_per_iteration": 2.7434427738189697 + }, + { + "auxiliary_loss_clip": 0.01108268, + "auxiliary_loss_mlp": 0.01084522, + "balance_loss_clip": 1.02541161, + "balance_loss_mlp": 1.00446081, + "epoch": 0.8686346419767931, + "flos": 20084587228800.0, + "grad_norm": 1.5968869542386857, + "language_loss": 0.69688046, + "learning_rate": 1.7818529087710378e-07, + "loss": 0.71880835, + "num_input_tokens_seen": 156281670, + "step": 7224, + "time_per_iteration": 2.7088606357574463 + }, + { + "auxiliary_loss_clip": 0.01123435, + "auxiliary_loss_mlp": 0.00872946, + "balance_loss_clip": 1.02350163, + "balance_loss_mlp": 1.00005364, + "epoch": 0.8687548848674322, + "flos": 18223516782720.0, + "grad_norm": 1.67830336101393, + "language_loss": 0.84108865, + "learning_rate": 1.7786401684212637e-07, + "loss": 0.86105251, + "num_input_tokens_seen": 156300500, + "step": 7225, + "time_per_iteration": 2.6860992908477783 + }, + { + "auxiliary_loss_clip": 0.01067309, + "auxiliary_loss_mlp": 0.0107893, + "balance_loss_clip": 1.02248228, + "balance_loss_mlp": 0.99996632, + "epoch": 0.8688751277580713, + "flos": 70457885049600.0, + "grad_norm": 0.7310599982709162, + "language_loss": 0.55963761, + "learning_rate": 1.7754301922372883e-07, + "loss": 0.58110005, + "num_input_tokens_seen": 156350145, + "step": 7226, + "time_per_iteration": 3.1691250801086426 + }, + { + "auxiliary_loss_clip": 0.01084887, + "auxiliary_loss_mlp": 0.01084335, + "balance_loss_clip": 1.018996, + "balance_loss_mlp": 1.00427389, + "epoch": 0.8689953706487104, + "flos": 26906788344960.0, + "grad_norm": 1.7619883075911276, + "language_loss": 0.80709261, + "learning_rate": 1.7722229807060617e-07, + "loss": 0.82878488, + "num_input_tokens_seen": 156368725, + "step": 7227, + "time_per_iteration": 2.887014865875244 + }, + { + "auxiliary_loss_clip": 0.0110798, + "auxiliary_loss_mlp": 0.01083629, + "balance_loss_clip": 1.02369618, + "balance_loss_mlp": 1.00356793, + "epoch": 0.8691156135393495, + "flos": 34637385438720.0, + "grad_norm": 4.035327796523529, + "language_loss": 0.81846362, + "learning_rate": 1.7690185343141172e-07, + "loss": 0.84037971, + "num_input_tokens_seen": 156388640, + "step": 7228, + "time_per_iteration": 2.8192079067230225 + }, + { + "auxiliary_loss_clip": 0.01117623, + "auxiliary_loss_mlp": 0.01082855, + "balance_loss_clip": 1.02465296, + "balance_loss_mlp": 1.00284159, + "epoch": 0.8692358564299886, + "flos": 18989814556800.0, + "grad_norm": 1.8959153361110646, + "language_loss": 0.69703835, + "learning_rate": 1.7658168535475615e-07, + "loss": 0.71904308, + "num_input_tokens_seen": 156406425, + "step": 7229, + "time_per_iteration": 2.69360089302063 + }, + { + "auxiliary_loss_clip": 0.01116726, + "auxiliary_loss_mlp": 0.01084035, + "balance_loss_clip": 1.02485085, + "balance_loss_mlp": 1.00392663, + "epoch": 0.8693560993206276, + "flos": 30370839039360.0, + "grad_norm": 1.4880104092971134, + "language_loss": 0.64242625, + "learning_rate": 1.7626179388920948e-07, + "loss": 0.66443384, + "num_input_tokens_seen": 156427705, + "step": 7230, + "time_per_iteration": 2.8019182682037354 + }, + { + "auxiliary_loss_clip": 0.01115019, + "auxiliary_loss_mlp": 0.00872884, + "balance_loss_clip": 1.02395129, + "balance_loss_mlp": 1.00010037, + "epoch": 0.8694763422112668, + "flos": 27200430028800.0, + "grad_norm": 1.7315921028588654, + "language_loss": 0.80438071, + "learning_rate": 1.7594217908329866e-07, + "loss": 0.82425976, + "num_input_tokens_seen": 156449890, + "step": 7231, + "time_per_iteration": 2.79818058013916 + }, + { + "auxiliary_loss_clip": 0.0111729, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_clip": 1.02521563, + "balance_loss_mlp": 1.0042119, + "epoch": 0.8695965851019059, + "flos": 26139161767680.0, + "grad_norm": 2.5075765917279695, + "language_loss": 0.73921692, + "learning_rate": 1.7562284098550895e-07, + "loss": 0.76123309, + "num_input_tokens_seen": 156469600, + "step": 7232, + "time_per_iteration": 2.7605860233306885 + }, + { + "auxiliary_loss_clip": 0.01091519, + "auxiliary_loss_mlp": 0.01078941, + "balance_loss_clip": 1.01313436, + "balance_loss_mlp": 0.99997675, + "epoch": 0.8697168279925449, + "flos": 67332616456320.0, + "grad_norm": 0.8388977761510246, + "language_loss": 0.62261468, + "learning_rate": 1.753037796442838e-07, + "loss": 0.64431924, + "num_input_tokens_seen": 156529040, + "step": 7233, + "time_per_iteration": 3.2436556816101074 + }, + { + "auxiliary_loss_clip": 0.01134003, + "auxiliary_loss_mlp": 0.01084919, + "balance_loss_clip": 1.0248034, + "balance_loss_mlp": 1.00476325, + "epoch": 0.8698370708831841, + "flos": 19718693337600.0, + "grad_norm": 3.1800482523707676, + "language_loss": 0.75428545, + "learning_rate": 1.74984995108024e-07, + "loss": 0.7764746, + "num_input_tokens_seen": 156546970, + "step": 7234, + "time_per_iteration": 3.6295883655548096 + }, + { + "auxiliary_loss_clip": 0.01125294, + "auxiliary_loss_mlp": 0.01083388, + "balance_loss_clip": 1.02525902, + "balance_loss_mlp": 1.00327933, + "epoch": 0.8699573137738231, + "flos": 12859971068160.0, + "grad_norm": 1.9386960673885583, + "language_loss": 0.82909149, + "learning_rate": 1.7466648742508981e-07, + "loss": 0.85117829, + "num_input_tokens_seen": 156563155, + "step": 7235, + "time_per_iteration": 2.620448589324951 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01084008, + "balance_loss_clip": 1.0242672, + "balance_loss_mlp": 1.00389957, + "epoch": 0.8700775566644622, + "flos": 17420733768960.0, + "grad_norm": 1.756066264226549, + "language_loss": 0.84624398, + "learning_rate": 1.7434825664379837e-07, + "loss": 0.86822915, + "num_input_tokens_seen": 156581660, + "step": 7236, + "time_per_iteration": 2.7245521545410156 + }, + { + "auxiliary_loss_clip": 0.01123588, + "auxiliary_loss_mlp": 0.01083814, + "balance_loss_clip": 1.02369893, + "balance_loss_mlp": 1.00360966, + "epoch": 0.8701977995551013, + "flos": 13735221770880.0, + "grad_norm": 6.547162335522579, + "language_loss": 0.8550539, + "learning_rate": 1.740303028124246e-07, + "loss": 0.87712795, + "num_input_tokens_seen": 156597720, + "step": 7237, + "time_per_iteration": 3.494504928588867 + }, + { + "auxiliary_loss_clip": 0.01085633, + "auxiliary_loss_mlp": 0.01084498, + "balance_loss_clip": 1.01969492, + "balance_loss_mlp": 1.00438905, + "epoch": 0.8703180424457404, + "flos": 30555707362560.0, + "grad_norm": 1.9726799377321913, + "language_loss": 0.75567138, + "learning_rate": 1.7371262597920212e-07, + "loss": 0.77737272, + "num_input_tokens_seen": 156619780, + "step": 7238, + "time_per_iteration": 2.894226312637329 + }, + { + "auxiliary_loss_clip": 0.01096297, + "auxiliary_loss_mlp": 0.01084369, + "balance_loss_clip": 1.02308655, + "balance_loss_mlp": 1.00435591, + "epoch": 0.8704382853363795, + "flos": 19608986223360.0, + "grad_norm": 1.7492319754366557, + "language_loss": 0.7629922, + "learning_rate": 1.7339522619232195e-07, + "loss": 0.78479886, + "num_input_tokens_seen": 156638160, + "step": 7239, + "time_per_iteration": 3.6501567363739014 + }, + { + "auxiliary_loss_clip": 0.01118325, + "auxiliary_loss_mlp": 0.01084041, + "balance_loss_clip": 1.02486849, + "balance_loss_mlp": 1.00383699, + "epoch": 0.8705585282270186, + "flos": 26613900846720.0, + "grad_norm": 1.8017610958285193, + "language_loss": 0.75419402, + "learning_rate": 1.730781034999338e-07, + "loss": 0.7762177, + "num_input_tokens_seen": 156659740, + "step": 7240, + "time_per_iteration": 2.789381504058838 + }, + { + "auxiliary_loss_clip": 0.01134772, + "auxiliary_loss_mlp": 0.01084035, + "balance_loss_clip": 1.02643013, + "balance_loss_mlp": 1.00402188, + "epoch": 0.8706787711176577, + "flos": 34090465979520.0, + "grad_norm": 2.8471862055772483, + "language_loss": 0.73325384, + "learning_rate": 1.7276125795014497e-07, + "loss": 0.7554419, + "num_input_tokens_seen": 156678190, + "step": 7241, + "time_per_iteration": 2.712968587875366 + }, + { + "auxiliary_loss_clip": 0.01118958, + "auxiliary_loss_mlp": 0.01083742, + "balance_loss_clip": 1.02544248, + "balance_loss_mlp": 1.00344253, + "epoch": 0.8707990140082967, + "flos": 14611513968000.0, + "grad_norm": 1.8787922313690346, + "language_loss": 0.67223877, + "learning_rate": 1.7244468959102054e-07, + "loss": 0.69426578, + "num_input_tokens_seen": 156695245, + "step": 7242, + "time_per_iteration": 2.675933837890625 + }, + { + "auxiliary_loss_clip": 0.01125235, + "auxiliary_loss_mlp": 0.01084819, + "balance_loss_clip": 1.02579737, + "balance_loss_mlp": 1.00461459, + "epoch": 0.8709192568989359, + "flos": 20084156265600.0, + "grad_norm": 2.493753019223673, + "language_loss": 0.85191858, + "learning_rate": 1.7212839847058348e-07, + "loss": 0.87401909, + "num_input_tokens_seen": 156710375, + "step": 7243, + "time_per_iteration": 2.6320948600769043 + }, + { + "auxiliary_loss_clip": 0.0106705, + "auxiliary_loss_mlp": 0.01084025, + "balance_loss_clip": 1.02172899, + "balance_loss_mlp": 1.00396383, + "epoch": 0.871039499789575, + "flos": 16727083251840.0, + "grad_norm": 4.166120800611645, + "language_loss": 0.7377398, + "learning_rate": 1.718123846368147e-07, + "loss": 0.75925052, + "num_input_tokens_seen": 156729420, + "step": 7244, + "time_per_iteration": 2.8228607177734375 + }, + { + "auxiliary_loss_clip": 0.01114242, + "auxiliary_loss_mlp": 0.00872816, + "balance_loss_clip": 1.02310419, + "balance_loss_mlp": 1.00005937, + "epoch": 0.871159742680214, + "flos": 21068790860160.0, + "grad_norm": 3.4345926671370646, + "language_loss": 0.7150619, + "learning_rate": 1.714966481376543e-07, + "loss": 0.73493242, + "num_input_tokens_seen": 156746100, + "step": 7245, + "time_per_iteration": 2.6601574420928955 + }, + { + "auxiliary_loss_clip": 0.01123953, + "auxiliary_loss_mlp": 0.01082877, + "balance_loss_clip": 1.02387798, + "balance_loss_mlp": 1.00272119, + "epoch": 0.8712799855708532, + "flos": 28256526731520.0, + "grad_norm": 1.9332579019160265, + "language_loss": 0.83083373, + "learning_rate": 1.7118118902099797e-07, + "loss": 0.85290205, + "num_input_tokens_seen": 156764185, + "step": 7246, + "time_per_iteration": 2.7983922958374023 + }, + { + "auxiliary_loss_clip": 0.01125581, + "auxiliary_loss_mlp": 0.01084393, + "balance_loss_clip": 1.02493227, + "balance_loss_mlp": 1.00433195, + "epoch": 0.8714002284614922, + "flos": 22236677665920.0, + "grad_norm": 1.5283742997817418, + "language_loss": 0.80831057, + "learning_rate": 1.7086600733470146e-07, + "loss": 0.8304103, + "num_input_tokens_seen": 156784855, + "step": 7247, + "time_per_iteration": 2.6872506141662598 + }, + { + "auxiliary_loss_clip": 0.01124298, + "auxiliary_loss_mlp": 0.01083549, + "balance_loss_clip": 1.02522087, + "balance_loss_mlp": 1.00348783, + "epoch": 0.8715204713521313, + "flos": 21431919404160.0, + "grad_norm": 1.7823187517692367, + "language_loss": 0.76813245, + "learning_rate": 1.7055110312657738e-07, + "loss": 0.7902109, + "num_input_tokens_seen": 156804350, + "step": 7248, + "time_per_iteration": 2.681215763092041 + }, + { + "auxiliary_loss_clip": 0.01116035, + "auxiliary_loss_mlp": 0.0108429, + "balance_loss_clip": 1.02398384, + "balance_loss_mlp": 1.00413358, + "epoch": 0.8716407142427703, + "flos": 23440439180160.0, + "grad_norm": 2.548284784824158, + "language_loss": 0.74137199, + "learning_rate": 1.702364764443962e-07, + "loss": 0.76337522, + "num_input_tokens_seen": 156823425, + "step": 7249, + "time_per_iteration": 2.8010823726654053 + }, + { + "auxiliary_loss_clip": 0.01088898, + "auxiliary_loss_mlp": 0.01084231, + "balance_loss_clip": 1.023193, + "balance_loss_mlp": 1.00412202, + "epoch": 0.8717609571334095, + "flos": 27958683156480.0, + "grad_norm": 2.134542592960157, + "language_loss": 0.72328317, + "learning_rate": 1.6992212733588685e-07, + "loss": 0.74501443, + "num_input_tokens_seen": 156843090, + "step": 7250, + "time_per_iteration": 2.8519210815429688 + }, + { + "auxiliary_loss_clip": 0.01118302, + "auxiliary_loss_mlp": 0.01084334, + "balance_loss_clip": 1.02551329, + "balance_loss_mlp": 1.00417745, + "epoch": 0.8718812000240486, + "flos": 25479482538240.0, + "grad_norm": 1.7201545668439258, + "language_loss": 0.75101191, + "learning_rate": 1.6960805584873538e-07, + "loss": 0.77303827, + "num_input_tokens_seen": 156861090, + "step": 7251, + "time_per_iteration": 2.7526237964630127 + }, + { + "auxiliary_loss_clip": 0.01096168, + "auxiliary_loss_mlp": 0.01083826, + "balance_loss_clip": 1.02210212, + "balance_loss_mlp": 1.0036695, + "epoch": 0.8720014429146876, + "flos": 23403056100480.0, + "grad_norm": 1.4372160665974267, + "language_loss": 0.78195894, + "learning_rate": 1.6929426203058684e-07, + "loss": 0.80375886, + "num_input_tokens_seen": 156881515, + "step": 7252, + "time_per_iteration": 2.8677279949188232 + }, + { + "auxiliary_loss_clip": 0.01133998, + "auxiliary_loss_mlp": 0.00873007, + "balance_loss_clip": 1.02457678, + "balance_loss_mlp": 1.00005019, + "epoch": 0.8721216858053268, + "flos": 24352821567360.0, + "grad_norm": 2.524179478723384, + "language_loss": 0.80097151, + "learning_rate": 1.689807459290431e-07, + "loss": 0.82104152, + "num_input_tokens_seen": 156900170, + "step": 7253, + "time_per_iteration": 2.6750905513763428 + }, + { + "auxiliary_loss_clip": 0.01109217, + "auxiliary_loss_mlp": 0.01083258, + "balance_loss_clip": 1.02338719, + "balance_loss_mlp": 1.00305343, + "epoch": 0.8722419286959658, + "flos": 33869687034240.0, + "grad_norm": 2.2807254780789137, + "language_loss": 0.70646548, + "learning_rate": 1.6866750759166437e-07, + "loss": 0.72839022, + "num_input_tokens_seen": 156920150, + "step": 7254, + "time_per_iteration": 2.8461453914642334 + }, + { + "auxiliary_loss_clip": 0.01107789, + "auxiliary_loss_mlp": 0.0108309, + "balance_loss_clip": 1.02357447, + "balance_loss_mlp": 1.00298142, + "epoch": 0.8723621715866049, + "flos": 18369385914240.0, + "grad_norm": 2.18168305502712, + "language_loss": 0.77685142, + "learning_rate": 1.6835454706596865e-07, + "loss": 0.79876024, + "num_input_tokens_seen": 156937980, + "step": 7255, + "time_per_iteration": 2.7434253692626953 + }, + { + "auxiliary_loss_clip": 0.01135621, + "auxiliary_loss_mlp": 0.01084237, + "balance_loss_clip": 1.02649808, + "balance_loss_mlp": 1.00403273, + "epoch": 0.8724824144772441, + "flos": 22013348855040.0, + "grad_norm": 1.707861288618789, + "language_loss": 0.7370013, + "learning_rate": 1.680418643994317e-07, + "loss": 0.75919986, + "num_input_tokens_seen": 156956550, + "step": 7256, + "time_per_iteration": 2.624300003051758 + }, + { + "auxiliary_loss_clip": 0.01112721, + "auxiliary_loss_mlp": 0.01078985, + "balance_loss_clip": 1.0175941, + "balance_loss_mlp": 1.00002122, + "epoch": 0.8726026573678831, + "flos": 66698720213760.0, + "grad_norm": 0.8906886218570184, + "language_loss": 0.6456719, + "learning_rate": 1.6772945963948738e-07, + "loss": 0.66758901, + "num_input_tokens_seen": 157014715, + "step": 7257, + "time_per_iteration": 3.1890692710876465 + }, + { + "auxiliary_loss_clip": 0.01108265, + "auxiliary_loss_mlp": 0.01084879, + "balance_loss_clip": 1.02305579, + "balance_loss_mlp": 1.00477016, + "epoch": 0.8727229002585222, + "flos": 13370908078080.0, + "grad_norm": 2.1810203396705012, + "language_loss": 0.77159536, + "learning_rate": 1.6741733283352733e-07, + "loss": 0.79352683, + "num_input_tokens_seen": 157032320, + "step": 7258, + "time_per_iteration": 2.7445857524871826 + }, + { + "auxiliary_loss_clip": 0.01079182, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_clip": 1.02193403, + "balance_loss_mlp": 1.00317419, + "epoch": 0.8728431431491613, + "flos": 21796987282560.0, + "grad_norm": 1.4640642941939197, + "language_loss": 0.83943218, + "learning_rate": 1.6710548402890102e-07, + "loss": 0.86105728, + "num_input_tokens_seen": 157052845, + "step": 7259, + "time_per_iteration": 4.5279786586761475 + }, + { + "auxiliary_loss_clip": 0.01135373, + "auxiliary_loss_mlp": 0.01084431, + "balance_loss_clip": 1.02545512, + "balance_loss_mlp": 1.00413191, + "epoch": 0.8729633860398004, + "flos": 36173823742080.0, + "grad_norm": 2.4043870425112814, + "language_loss": 0.66830885, + "learning_rate": 1.6679391327291527e-07, + "loss": 0.69050694, + "num_input_tokens_seen": 157074050, + "step": 7260, + "time_per_iteration": 2.771197557449341 + }, + { + "auxiliary_loss_clip": 0.0111894, + "auxiliary_loss_mlp": 0.01083388, + "balance_loss_clip": 1.02534604, + "balance_loss_mlp": 1.00342202, + "epoch": 0.8730836289304394, + "flos": 16359680989440.0, + "grad_norm": 2.5129538830558062, + "language_loss": 0.68304443, + "learning_rate": 1.6648262061283492e-07, + "loss": 0.70506775, + "num_input_tokens_seen": 157089350, + "step": 7261, + "time_per_iteration": 2.6749448776245117 + }, + { + "auxiliary_loss_clip": 0.01108244, + "auxiliary_loss_mlp": 0.01084077, + "balance_loss_clip": 1.02358258, + "balance_loss_mlp": 1.00387263, + "epoch": 0.8732038718210786, + "flos": 21215126868480.0, + "grad_norm": 2.394305548454794, + "language_loss": 0.7323122, + "learning_rate": 1.6617160609588353e-07, + "loss": 0.75423539, + "num_input_tokens_seen": 157108525, + "step": 7262, + "time_per_iteration": 3.6773264408111572 + }, + { + "auxiliary_loss_clip": 0.01099381, + "auxiliary_loss_mlp": 0.01084479, + "balance_loss_clip": 1.02464938, + "balance_loss_mlp": 1.00427532, + "epoch": 0.8733241147117177, + "flos": 16610696208000.0, + "grad_norm": 2.244386321530919, + "language_loss": 0.72129899, + "learning_rate": 1.6586086976924163e-07, + "loss": 0.7431376, + "num_input_tokens_seen": 157124025, + "step": 7263, + "time_per_iteration": 2.7080416679382324 + }, + { + "auxiliary_loss_clip": 0.01125646, + "auxiliary_loss_mlp": 0.01083448, + "balance_loss_clip": 1.02485025, + "balance_loss_mlp": 1.00343466, + "epoch": 0.8734443576023567, + "flos": 20193935207040.0, + "grad_norm": 1.6803359524114716, + "language_loss": 0.77968848, + "learning_rate": 1.6555041168004747e-07, + "loss": 0.80177939, + "num_input_tokens_seen": 157143345, + "step": 7264, + "time_per_iteration": 2.6930580139160156 + }, + { + "auxiliary_loss_clip": 0.01115535, + "auxiliary_loss_mlp": 0.01083921, + "balance_loss_clip": 1.02459228, + "balance_loss_mlp": 1.00376487, + "epoch": 0.8735646004929959, + "flos": 18041162411520.0, + "grad_norm": 2.3360099101051057, + "language_loss": 0.68884468, + "learning_rate": 1.6524023187539715e-07, + "loss": 0.71083927, + "num_input_tokens_seen": 157161630, + "step": 7265, + "time_per_iteration": 3.5379157066345215 + }, + { + "auxiliary_loss_clip": 0.011169, + "auxiliary_loss_mlp": 0.01084202, + "balance_loss_clip": 1.02454317, + "balance_loss_mlp": 1.00404525, + "epoch": 0.873684843383635, + "flos": 20262344659200.0, + "grad_norm": 2.0547300001384796, + "language_loss": 0.74747354, + "learning_rate": 1.649303304023446e-07, + "loss": 0.76948452, + "num_input_tokens_seen": 157181385, + "step": 7266, + "time_per_iteration": 2.8095083236694336 + }, + { + "auxiliary_loss_clip": 0.01103473, + "auxiliary_loss_mlp": 0.01084319, + "balance_loss_clip": 1.02368414, + "balance_loss_mlp": 1.00430548, + "epoch": 0.873805086274274, + "flos": 16947287579520.0, + "grad_norm": 1.764495523550111, + "language_loss": 0.78722918, + "learning_rate": 1.6462070730790246e-07, + "loss": 0.80910707, + "num_input_tokens_seen": 157200545, + "step": 7267, + "time_per_iteration": 2.7061803340911865 + }, + { + "auxiliary_loss_clip": 0.01117753, + "auxiliary_loss_mlp": 0.01084126, + "balance_loss_clip": 1.02469599, + "balance_loss_mlp": 1.00392199, + "epoch": 0.8739253291649132, + "flos": 18041270152320.0, + "grad_norm": 2.475474148397148, + "language_loss": 0.78551823, + "learning_rate": 1.6431136263903912e-07, + "loss": 0.80753702, + "num_input_tokens_seen": 157219545, + "step": 7268, + "time_per_iteration": 2.708615779876709 + }, + { + "auxiliary_loss_clip": 0.01126671, + "auxiliary_loss_mlp": 0.00872907, + "balance_loss_clip": 1.02531338, + "balance_loss_mlp": 1.00009096, + "epoch": 0.8740455720555522, + "flos": 21325085377920.0, + "grad_norm": 1.816464943883555, + "language_loss": 0.73663521, + "learning_rate": 1.6400229644268282e-07, + "loss": 0.75663096, + "num_input_tokens_seen": 157237900, + "step": 7269, + "time_per_iteration": 2.659775495529175 + }, + { + "auxiliary_loss_clip": 0.01093501, + "auxiliary_loss_mlp": 0.01084582, + "balance_loss_clip": 1.02009296, + "balance_loss_mlp": 1.00447369, + "epoch": 0.8741658149461913, + "flos": 15158684822400.0, + "grad_norm": 2.0584632489364107, + "language_loss": 0.81312919, + "learning_rate": 1.6369350876571852e-07, + "loss": 0.83491004, + "num_input_tokens_seen": 157256055, + "step": 7270, + "time_per_iteration": 2.7710797786712646 + }, + { + "auxiliary_loss_clip": 0.01099271, + "auxiliary_loss_mlp": 0.01083999, + "balance_loss_clip": 1.02350068, + "balance_loss_mlp": 1.00384235, + "epoch": 0.8742860578368304, + "flos": 23039855729280.0, + "grad_norm": 1.9176753419213959, + "language_loss": 0.81496859, + "learning_rate": 1.6338499965498874e-07, + "loss": 0.83680129, + "num_input_tokens_seen": 157274785, + "step": 7271, + "time_per_iteration": 2.8205785751342773 + }, + { + "auxiliary_loss_clip": 0.01103498, + "auxiliary_loss_mlp": 0.01083984, + "balance_loss_clip": 1.02055919, + "balance_loss_mlp": 1.00382781, + "epoch": 0.8744063007274695, + "flos": 28145347159680.0, + "grad_norm": 1.474970839208468, + "language_loss": 0.77729398, + "learning_rate": 1.630767691572943e-07, + "loss": 0.79916883, + "num_input_tokens_seen": 157294805, + "step": 7272, + "time_per_iteration": 2.79125714302063 + }, + { + "auxiliary_loss_clip": 0.0109676, + "auxiliary_loss_mlp": 0.01078904, + "balance_loss_clip": 1.01746941, + "balance_loss_mlp": 0.99993992, + "epoch": 0.8745265436181086, + "flos": 64034076654720.0, + "grad_norm": 0.7454030046864397, + "language_loss": 0.53544468, + "learning_rate": 1.6276881731939306e-07, + "loss": 0.55720133, + "num_input_tokens_seen": 157356695, + "step": 7273, + "time_per_iteration": 3.318110466003418 + }, + { + "auxiliary_loss_clip": 0.0112395, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_clip": 1.02419353, + "balance_loss_mlp": 1.0041132, + "epoch": 0.8746467865087477, + "flos": 28658618553600.0, + "grad_norm": 2.8272383083362946, + "language_loss": 0.75500298, + "learning_rate": 1.6246114418800193e-07, + "loss": 0.77708471, + "num_input_tokens_seen": 157376975, + "step": 7274, + "time_per_iteration": 2.73071551322937 + }, + { + "auxiliary_loss_clip": 0.01126336, + "auxiliary_loss_mlp": 0.01084284, + "balance_loss_clip": 1.02488589, + "balance_loss_mlp": 1.00417566, + "epoch": 0.8747670293993868, + "flos": 23985850268160.0, + "grad_norm": 1.6561313725681057, + "language_loss": 0.76339138, + "learning_rate": 1.6215374980979423e-07, + "loss": 0.78549755, + "num_input_tokens_seen": 157397385, + "step": 7275, + "time_per_iteration": 2.748933792114258 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_clip": 1.0257833, + "balance_loss_mlp": 1.00403714, + "epoch": 0.8748872722900258, + "flos": 45221624478720.0, + "grad_norm": 2.3965319411590618, + "language_loss": 0.68437219, + "learning_rate": 1.6184663423140133e-07, + "loss": 0.70640963, + "num_input_tokens_seen": 157417685, + "step": 7276, + "time_per_iteration": 2.935412645339966 + }, + { + "auxiliary_loss_clip": 0.01100969, + "auxiliary_loss_mlp": 0.0108518, + "balance_loss_clip": 1.02523756, + "balance_loss_mlp": 1.0050236, + "epoch": 0.875007515180665, + "flos": 19754280737280.0, + "grad_norm": 1.8356769296971218, + "language_loss": 0.64310312, + "learning_rate": 1.615397974994126e-07, + "loss": 0.66496468, + "num_input_tokens_seen": 157435490, + "step": 7277, + "time_per_iteration": 2.836475372314453 + }, + { + "auxiliary_loss_clip": 0.01134693, + "auxiliary_loss_mlp": 0.01084235, + "balance_loss_clip": 1.02566361, + "balance_loss_mlp": 1.0042212, + "epoch": 0.875127758071304, + "flos": 22710734386560.0, + "grad_norm": 1.5226594214251692, + "language_loss": 0.80725515, + "learning_rate": 1.6123323966037438e-07, + "loss": 0.82944441, + "num_input_tokens_seen": 157454010, + "step": 7278, + "time_per_iteration": 2.676494598388672 + }, + { + "auxiliary_loss_clip": 0.01136305, + "auxiliary_loss_mlp": 0.01084908, + "balance_loss_clip": 1.02706242, + "balance_loss_mlp": 1.00479913, + "epoch": 0.8752480009619431, + "flos": 23403846199680.0, + "grad_norm": 2.4240551824299787, + "language_loss": 0.78866661, + "learning_rate": 1.6092696076079216e-07, + "loss": 0.81087875, + "num_input_tokens_seen": 157472385, + "step": 7279, + "time_per_iteration": 2.646796226501465 + }, + { + "auxiliary_loss_clip": 0.01104984, + "auxiliary_loss_mlp": 0.01084174, + "balance_loss_clip": 1.02288163, + "balance_loss_mlp": 1.00416088, + "epoch": 0.8753682438525822, + "flos": 26213101914240.0, + "grad_norm": 1.6419154919498775, + "language_loss": 0.73677039, + "learning_rate": 1.6062096084712785e-07, + "loss": 0.75866205, + "num_input_tokens_seen": 157493735, + "step": 7280, + "time_per_iteration": 2.73679780960083 + }, + { + "auxiliary_loss_clip": 0.01118203, + "auxiliary_loss_mlp": 0.00872949, + "balance_loss_clip": 1.02494228, + "balance_loss_mlp": 1.00013614, + "epoch": 0.8754884867432213, + "flos": 23326745656320.0, + "grad_norm": 1.8741594055290058, + "language_loss": 0.70549792, + "learning_rate": 1.6031523996580098e-07, + "loss": 0.72540945, + "num_input_tokens_seen": 157511295, + "step": 7281, + "time_per_iteration": 2.7120625972747803 + }, + { + "auxiliary_loss_clip": 0.01108112, + "auxiliary_loss_mlp": 0.01083205, + "balance_loss_clip": 1.02390528, + "balance_loss_mlp": 1.00304818, + "epoch": 0.8756087296338604, + "flos": 12495226412160.0, + "grad_norm": 2.277504865583755, + "language_loss": 0.66235447, + "learning_rate": 1.6000979816318981e-07, + "loss": 0.6842677, + "num_input_tokens_seen": 157529760, + "step": 7282, + "time_per_iteration": 2.738302707672119 + }, + { + "auxiliary_loss_clip": 0.01118471, + "auxiliary_loss_mlp": 0.01084402, + "balance_loss_clip": 1.02440858, + "balance_loss_mlp": 1.004246, + "epoch": 0.8757289725244994, + "flos": 18952898353920.0, + "grad_norm": 2.49048887760923, + "language_loss": 0.74466598, + "learning_rate": 1.5970463548562886e-07, + "loss": 0.76669466, + "num_input_tokens_seen": 157548915, + "step": 7283, + "time_per_iteration": 3.493313789367676 + }, + { + "auxiliary_loss_clip": 0.01112933, + "auxiliary_loss_mlp": 0.01083684, + "balance_loss_clip": 1.02264273, + "balance_loss_mlp": 1.00357497, + "epoch": 0.8758492154151386, + "flos": 25265958140160.0, + "grad_norm": 1.6715169505296998, + "language_loss": 0.70822656, + "learning_rate": 1.5939975197941192e-07, + "loss": 0.73019278, + "num_input_tokens_seen": 157570570, + "step": 7284, + "time_per_iteration": 2.7721643447875977 + }, + { + "auxiliary_loss_clip": 0.01096827, + "auxiliary_loss_mlp": 0.0107893, + "balance_loss_clip": 1.01754498, + "balance_loss_mlp": 0.99996597, + "epoch": 0.8759694583057777, + "flos": 65571664193280.0, + "grad_norm": 0.8871911044149886, + "language_loss": 0.53423476, + "learning_rate": 1.5909514769078892e-07, + "loss": 0.55599225, + "num_input_tokens_seen": 157635675, + "step": 7285, + "time_per_iteration": 4.244749546051025 + }, + { + "auxiliary_loss_clip": 0.01101654, + "auxiliary_loss_mlp": 0.01084411, + "balance_loss_clip": 1.02138186, + "balance_loss_mlp": 1.00439799, + "epoch": 0.8760897011964167, + "flos": 25446193608960.0, + "grad_norm": 1.4435366382428494, + "language_loss": 0.77728665, + "learning_rate": 1.5879082266596867e-07, + "loss": 0.79914731, + "num_input_tokens_seen": 157657015, + "step": 7286, + "time_per_iteration": 2.864098072052002 + }, + { + "auxiliary_loss_clip": 0.01116285, + "auxiliary_loss_mlp": 0.01084141, + "balance_loss_clip": 1.02358341, + "balance_loss_mlp": 1.00403261, + "epoch": 0.8762099440870559, + "flos": 28984830894720.0, + "grad_norm": 1.6782515339495692, + "language_loss": 0.71820951, + "learning_rate": 1.5848677695111645e-07, + "loss": 0.74021375, + "num_input_tokens_seen": 157678615, + "step": 7287, + "time_per_iteration": 2.8028054237365723 + }, + { + "auxiliary_loss_clip": 0.01107654, + "auxiliary_loss_mlp": 0.01085062, + "balance_loss_clip": 1.02407908, + "balance_loss_mlp": 1.00481009, + "epoch": 0.8763301869776949, + "flos": 21609461352960.0, + "grad_norm": 2.3878196843509856, + "language_loss": 0.69740421, + "learning_rate": 1.5818301059235562e-07, + "loss": 0.71933138, + "num_input_tokens_seen": 157693790, + "step": 7288, + "time_per_iteration": 3.6317574977874756 + }, + { + "auxiliary_loss_clip": 0.01113879, + "auxiliary_loss_mlp": 0.01083947, + "balance_loss_clip": 1.02286005, + "balance_loss_mlp": 1.00383818, + "epoch": 0.876450429868334, + "flos": 24644416176000.0, + "grad_norm": 1.595452497108434, + "language_loss": 0.81509066, + "learning_rate": 1.578795236357684e-07, + "loss": 0.83706892, + "num_input_tokens_seen": 157715255, + "step": 7289, + "time_per_iteration": 2.773155689239502 + }, + { + "auxiliary_loss_clip": 0.01115431, + "auxiliary_loss_mlp": 0.01084009, + "balance_loss_clip": 1.02493739, + "balance_loss_mlp": 1.00390053, + "epoch": 0.8765706727589732, + "flos": 20260046188800.0, + "grad_norm": 2.097768901686803, + "language_loss": 0.85404944, + "learning_rate": 1.5757631612739218e-07, + "loss": 0.8760438, + "num_input_tokens_seen": 157728800, + "step": 7290, + "time_per_iteration": 3.5989880561828613 + }, + { + "auxiliary_loss_clip": 0.01112733, + "auxiliary_loss_mlp": 0.01079094, + "balance_loss_clip": 1.0176059, + "balance_loss_mlp": 1.00013018, + "epoch": 0.8766909156496122, + "flos": 71371165276800.0, + "grad_norm": 0.7865079695761803, + "language_loss": 0.61478627, + "learning_rate": 1.572733881132242e-07, + "loss": 0.63670456, + "num_input_tokens_seen": 157789445, + "step": 7291, + "time_per_iteration": 3.2649099826812744 + }, + { + "auxiliary_loss_clip": 0.01083422, + "auxiliary_loss_mlp": 0.01078946, + "balance_loss_clip": 1.0132246, + "balance_loss_mlp": 0.99998134, + "epoch": 0.8768111585402513, + "flos": 69523490603520.0, + "grad_norm": 1.3959877749541743, + "language_loss": 0.58485025, + "learning_rate": 1.5697073963921814e-07, + "loss": 0.60647392, + "num_input_tokens_seen": 157848685, + "step": 7292, + "time_per_iteration": 3.2429747581481934 + }, + { + "auxiliary_loss_clip": 0.01123913, + "auxiliary_loss_mlp": 0.01083829, + "balance_loss_clip": 1.02449167, + "balance_loss_mlp": 1.00376832, + "epoch": 0.8769314014308904, + "flos": 18838558385280.0, + "grad_norm": 2.6792672830843656, + "language_loss": 0.84715998, + "learning_rate": 1.566683707512857e-07, + "loss": 0.86923742, + "num_input_tokens_seen": 157866360, + "step": 7293, + "time_per_iteration": 2.661182165145874 + }, + { + "auxiliary_loss_clip": 0.01116545, + "auxiliary_loss_mlp": 0.01084108, + "balance_loss_clip": 1.02435446, + "balance_loss_mlp": 1.00395203, + "epoch": 0.8770516443215295, + "flos": 14976402278400.0, + "grad_norm": 1.8708364426374613, + "language_loss": 0.79279917, + "learning_rate": 1.5636628149529553e-07, + "loss": 0.81480569, + "num_input_tokens_seen": 157884150, + "step": 7294, + "time_per_iteration": 2.6303863525390625 + }, + { + "auxiliary_loss_clip": 0.01109813, + "auxiliary_loss_mlp": 0.01084447, + "balance_loss_clip": 1.02336454, + "balance_loss_mlp": 1.00433826, + "epoch": 0.8771718872121685, + "flos": 31649654021760.0, + "grad_norm": 1.9940295405894952, + "language_loss": 0.79596972, + "learning_rate": 1.560644719170743e-07, + "loss": 0.81791234, + "num_input_tokens_seen": 157905020, + "step": 7295, + "time_per_iteration": 2.8165080547332764 + }, + { + "auxiliary_loss_clip": 0.0110766, + "auxiliary_loss_mlp": 0.01084425, + "balance_loss_clip": 1.02327871, + "balance_loss_mlp": 1.00426841, + "epoch": 0.8772921301028077, + "flos": 36095466222720.0, + "grad_norm": 1.6862407151466843, + "language_loss": 0.7208997, + "learning_rate": 1.5576294206240692e-07, + "loss": 0.74282056, + "num_input_tokens_seen": 157924545, + "step": 7296, + "time_per_iteration": 2.8520665168762207 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.01084037, + "balance_loss_clip": 1.0207727, + "balance_loss_mlp": 1.00392854, + "epoch": 0.8774123729934468, + "flos": 57116961849600.0, + "grad_norm": 1.6059873958866484, + "language_loss": 0.67478967, + "learning_rate": 1.5546169197703507e-07, + "loss": 0.69674206, + "num_input_tokens_seen": 157950820, + "step": 7297, + "time_per_iteration": 3.0625529289245605 + }, + { + "auxiliary_loss_clip": 0.01117635, + "auxiliary_loss_mlp": 0.01084311, + "balance_loss_clip": 1.02429891, + "balance_loss_mlp": 1.00420225, + "epoch": 0.8775326158840858, + "flos": 23914495900800.0, + "grad_norm": 2.460154562026785, + "language_loss": 0.77796537, + "learning_rate": 1.5516072170665774e-07, + "loss": 0.79998481, + "num_input_tokens_seen": 157968790, + "step": 7298, + "time_per_iteration": 2.77327561378479 + }, + { + "auxiliary_loss_clip": 0.0112682, + "auxiliary_loss_mlp": 0.01084849, + "balance_loss_clip": 1.02626848, + "balance_loss_mlp": 1.00474, + "epoch": 0.877652858774725, + "flos": 17123285243520.0, + "grad_norm": 1.7935865152490553, + "language_loss": 0.86598486, + "learning_rate": 1.5486003129693214e-07, + "loss": 0.88810146, + "num_input_tokens_seen": 157986155, + "step": 7299, + "time_per_iteration": 2.6526730060577393 + }, + { + "auxiliary_loss_clip": 0.01126037, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_clip": 1.02490115, + "balance_loss_mlp": 1.00370955, + "epoch": 0.877773101665364, + "flos": 16508961912960.0, + "grad_norm": 2.0641184338595506, + "language_loss": 0.78120697, + "learning_rate": 1.545596207934725e-07, + "loss": 0.80330551, + "num_input_tokens_seen": 158004640, + "step": 7300, + "time_per_iteration": 2.652188777923584 + }, + { + "auxiliary_loss_clip": 0.0111669, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_clip": 1.02380562, + "balance_loss_mlp": 1.00393009, + "epoch": 0.8778933445560031, + "flos": 22053209973120.0, + "grad_norm": 2.2520801758413445, + "language_loss": 0.77803582, + "learning_rate": 1.5425949024185147e-07, + "loss": 0.80004263, + "num_input_tokens_seen": 158024665, + "step": 7301, + "time_per_iteration": 2.7294535636901855 + }, + { + "auxiliary_loss_clip": 0.01118662, + "auxiliary_loss_mlp": 0.01084592, + "balance_loss_clip": 1.02580261, + "balance_loss_mlp": 1.00457883, + "epoch": 0.8780135874466423, + "flos": 22564757514240.0, + "grad_norm": 1.7547729623767214, + "language_loss": 0.67476439, + "learning_rate": 1.5395963968759818e-07, + "loss": 0.69679689, + "num_input_tokens_seen": 158044940, + "step": 7302, + "time_per_iteration": 2.7647390365600586 + }, + { + "auxiliary_loss_clip": 0.01115085, + "auxiliary_loss_mlp": 0.01083198, + "balance_loss_clip": 1.02307749, + "balance_loss_mlp": 1.00313675, + "epoch": 0.8781338303372813, + "flos": 61531999073280.0, + "grad_norm": 1.4013108810947839, + "language_loss": 0.64440274, + "learning_rate": 1.536600691761998e-07, + "loss": 0.66638553, + "num_input_tokens_seen": 158070770, + "step": 7303, + "time_per_iteration": 3.1996912956237793 + }, + { + "auxiliary_loss_clip": 0.01106368, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_clip": 1.02389526, + "balance_loss_mlp": 1.00416899, + "epoch": 0.8782540732279204, + "flos": 22674751937280.0, + "grad_norm": 1.6650100800442664, + "language_loss": 0.71656096, + "learning_rate": 1.5336077875310084e-07, + "loss": 0.73846591, + "num_input_tokens_seen": 158089995, + "step": 7304, + "time_per_iteration": 2.781435251235962 + }, + { + "auxiliary_loss_clip": 0.0109844, + "auxiliary_loss_mlp": 0.01082814, + "balance_loss_clip": 1.02324975, + "balance_loss_mlp": 1.00275326, + "epoch": 0.8783743161185595, + "flos": 16070348937600.0, + "grad_norm": 1.8351562772224637, + "language_loss": 0.73860151, + "learning_rate": 1.5306176846370321e-07, + "loss": 0.76041412, + "num_input_tokens_seen": 158108140, + "step": 7305, + "time_per_iteration": 2.6974265575408936 + }, + { + "auxiliary_loss_clip": 0.01117871, + "auxiliary_loss_mlp": 0.01083621, + "balance_loss_clip": 1.02495658, + "balance_loss_mlp": 1.0035125, + "epoch": 0.8784945590091986, + "flos": 26067879227520.0, + "grad_norm": 2.3256706766310327, + "language_loss": 0.74141538, + "learning_rate": 1.5276303835336712e-07, + "loss": 0.7634303, + "num_input_tokens_seen": 158128680, + "step": 7306, + "time_per_iteration": 2.7729532718658447 + }, + { + "auxiliary_loss_clip": 0.01105004, + "auxiliary_loss_mlp": 0.01079119, + "balance_loss_clip": 1.01760721, + "balance_loss_mlp": 1.00015438, + "epoch": 0.8786148018998376, + "flos": 62720643939840.0, + "grad_norm": 0.7703292887648858, + "language_loss": 0.53546417, + "learning_rate": 1.524645884674094e-07, + "loss": 0.5573054, + "num_input_tokens_seen": 158185610, + "step": 7307, + "time_per_iteration": 3.2480924129486084 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.0087306, + "balance_loss_clip": 1.02533507, + "balance_loss_mlp": 1.00002706, + "epoch": 0.8787350447904768, + "flos": 21652734263040.0, + "grad_norm": 2.4152746480093636, + "language_loss": 0.78851104, + "learning_rate": 1.521664188511047e-07, + "loss": 0.80858713, + "num_input_tokens_seen": 158205635, + "step": 7308, + "time_per_iteration": 2.6597626209259033 + }, + { + "auxiliary_loss_clip": 0.01111277, + "auxiliary_loss_mlp": 0.00872872, + "balance_loss_clip": 1.02109528, + "balance_loss_mlp": 1.00012755, + "epoch": 0.8788552876811159, + "flos": 25478476957440.0, + "grad_norm": 2.518953127153579, + "language_loss": 0.80318439, + "learning_rate": 1.518685295496851e-07, + "loss": 0.82302582, + "num_input_tokens_seen": 158223495, + "step": 7309, + "time_per_iteration": 3.720979690551758 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01083151, + "balance_loss_clip": 1.02517784, + "balance_loss_mlp": 1.00313807, + "epoch": 0.8789755305717549, + "flos": 22310222762880.0, + "grad_norm": 1.6858291597799397, + "language_loss": 0.8538115, + "learning_rate": 1.5157092060833975e-07, + "loss": 0.87590569, + "num_input_tokens_seen": 158243145, + "step": 7310, + "time_per_iteration": 2.6571080684661865 + }, + { + "auxiliary_loss_clip": 0.01101028, + "auxiliary_loss_mlp": 0.01084232, + "balance_loss_clip": 1.02489352, + "balance_loss_mlp": 1.00417066, + "epoch": 0.879095773462394, + "flos": 29310971408640.0, + "grad_norm": 1.6601850792853825, + "language_loss": 0.65857273, + "learning_rate": 1.5127359207221658e-07, + "loss": 0.68042535, + "num_input_tokens_seen": 158262625, + "step": 7311, + "time_per_iteration": 3.761909246444702 + }, + { + "auxiliary_loss_clip": 0.01090106, + "auxiliary_loss_mlp": 0.01083424, + "balance_loss_clip": 1.02183568, + "balance_loss_mlp": 1.00326717, + "epoch": 0.8792160163530331, + "flos": 16690023394560.0, + "grad_norm": 6.5182121826142, + "language_loss": 0.73369741, + "learning_rate": 1.5097654398641923e-07, + "loss": 0.75543267, + "num_input_tokens_seen": 158280530, + "step": 7312, + "time_per_iteration": 2.8083508014678955 + }, + { + "auxiliary_loss_clip": 0.01110104, + "auxiliary_loss_mlp": 0.01084786, + "balance_loss_clip": 1.02537322, + "balance_loss_mlp": 1.00462961, + "epoch": 0.8793362592436722, + "flos": 24499301230080.0, + "grad_norm": 1.3054393880627901, + "language_loss": 0.72959441, + "learning_rate": 1.5067977639601014e-07, + "loss": 0.75154328, + "num_input_tokens_seen": 158303290, + "step": 7313, + "time_per_iteration": 2.64558482170105 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01084063, + "balance_loss_clip": 1.02293897, + "balance_loss_mlp": 1.00395489, + "epoch": 0.8794565021343113, + "flos": 14538399834240.0, + "grad_norm": 2.2592357388413187, + "language_loss": 0.71136153, + "learning_rate": 1.5038328934600864e-07, + "loss": 0.73328561, + "num_input_tokens_seen": 158319925, + "step": 7314, + "time_per_iteration": 3.6254727840423584 + }, + { + "auxiliary_loss_clip": 0.01114703, + "auxiliary_loss_mlp": 0.01083631, + "balance_loss_clip": 1.02409577, + "balance_loss_mlp": 1.00361812, + "epoch": 0.8795767450249504, + "flos": 39530286224640.0, + "grad_norm": 1.7627482895916426, + "language_loss": 0.69596624, + "learning_rate": 1.5008708288139161e-07, + "loss": 0.71794963, + "num_input_tokens_seen": 158342285, + "step": 7315, + "time_per_iteration": 3.896063804626465 + }, + { + "auxiliary_loss_clip": 0.01121278, + "auxiliary_loss_mlp": 0.01084634, + "balance_loss_clip": 1.02263236, + "balance_loss_mlp": 1.00447726, + "epoch": 0.8796969879155895, + "flos": 22960672197120.0, + "grad_norm": 3.415034066295813, + "language_loss": 0.73640883, + "learning_rate": 1.497911570470931e-07, + "loss": 0.75846797, + "num_input_tokens_seen": 158362290, + "step": 7316, + "time_per_iteration": 2.728214979171753 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01084519, + "balance_loss_clip": 1.02394009, + "balance_loss_mlp": 1.00445795, + "epoch": 0.8798172308062285, + "flos": 28362427004160.0, + "grad_norm": 1.6638554722594403, + "language_loss": 0.85821861, + "learning_rate": 1.494955118880048e-07, + "loss": 0.88012987, + "num_input_tokens_seen": 158383275, + "step": 7317, + "time_per_iteration": 2.8298792839050293 + }, + { + "auxiliary_loss_clip": 0.01124652, + "auxiliary_loss_mlp": 0.01083075, + "balance_loss_clip": 1.02412593, + "balance_loss_mlp": 1.00310898, + "epoch": 0.8799374736968677, + "flos": 23988974751360.0, + "grad_norm": 1.6696204740322877, + "language_loss": 0.72947341, + "learning_rate": 1.4920014744897634e-07, + "loss": 0.75155067, + "num_input_tokens_seen": 158402690, + "step": 7318, + "time_per_iteration": 2.6558432579040527 + }, + { + "auxiliary_loss_clip": 0.01110203, + "auxiliary_loss_mlp": 0.01083706, + "balance_loss_clip": 1.02421665, + "balance_loss_mlp": 1.00369251, + "epoch": 0.8800577165875068, + "flos": 25630271832960.0, + "grad_norm": 1.6765287206263884, + "language_loss": 0.8635447, + "learning_rate": 1.4890506377481392e-07, + "loss": 0.88548374, + "num_input_tokens_seen": 158421780, + "step": 7319, + "time_per_iteration": 2.9373066425323486 + }, + { + "auxiliary_loss_clip": 0.01068232, + "auxiliary_loss_mlp": 0.01084328, + "balance_loss_clip": 1.01993823, + "balance_loss_mlp": 1.0043149, + "epoch": 0.8801779594781458, + "flos": 23440331439360.0, + "grad_norm": 1.5001553043025633, + "language_loss": 0.63953257, + "learning_rate": 1.486102609102815e-07, + "loss": 0.66105819, + "num_input_tokens_seen": 158442330, + "step": 7320, + "time_per_iteration": 2.7973482608795166 + }, + { + "auxiliary_loss_clip": 0.01107338, + "auxiliary_loss_mlp": 0.01083516, + "balance_loss_clip": 1.02231169, + "balance_loss_mlp": 1.00340688, + "epoch": 0.880298202368785, + "flos": 11508580656000.0, + "grad_norm": 2.6400592173043327, + "language_loss": 0.8571474, + "learning_rate": 1.483157389001004e-07, + "loss": 0.87905592, + "num_input_tokens_seen": 158459890, + "step": 7321, + "time_per_iteration": 2.6824960708618164 + }, + { + "auxiliary_loss_clip": 0.01119086, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_clip": 1.02529442, + "balance_loss_mlp": 1.00421429, + "epoch": 0.880418445259424, + "flos": 22671447886080.0, + "grad_norm": 2.071499558212094, + "language_loss": 0.78964484, + "learning_rate": 1.4802149778894933e-07, + "loss": 0.81168038, + "num_input_tokens_seen": 158478680, + "step": 7322, + "time_per_iteration": 2.7165396213531494 + }, + { + "auxiliary_loss_clip": 0.01124589, + "auxiliary_loss_mlp": 0.01084554, + "balance_loss_clip": 1.02354085, + "balance_loss_mlp": 1.00454092, + "epoch": 0.8805386881500631, + "flos": 20522158709760.0, + "grad_norm": 1.7553197131796385, + "language_loss": 0.87489343, + "learning_rate": 1.4772753762146484e-07, + "loss": 0.89698488, + "num_input_tokens_seen": 158497935, + "step": 7323, + "time_per_iteration": 2.615859031677246 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_clip": 1.02454805, + "balance_loss_mlp": 1.00375617, + "epoch": 0.8806589310407023, + "flos": 36538891620480.0, + "grad_norm": 1.8137076453815122, + "language_loss": 0.70344853, + "learning_rate": 1.474338584422401e-07, + "loss": 0.7255491, + "num_input_tokens_seen": 158523145, + "step": 7324, + "time_per_iteration": 2.805311441421509 + }, + { + "auxiliary_loss_clip": 0.01123974, + "auxiliary_loss_mlp": 0.01084359, + "balance_loss_clip": 1.02462316, + "balance_loss_mlp": 1.00424993, + "epoch": 0.8807791739313413, + "flos": 23440187784960.0, + "grad_norm": 1.6207649959083537, + "language_loss": 0.75794864, + "learning_rate": 1.4714046029582595e-07, + "loss": 0.78003198, + "num_input_tokens_seen": 158542210, + "step": 7325, + "time_per_iteration": 2.670823335647583 + }, + { + "auxiliary_loss_clip": 0.01107891, + "auxiliary_loss_mlp": 0.01084361, + "balance_loss_clip": 1.02395177, + "balance_loss_mlp": 1.00430024, + "epoch": 0.8808994168219804, + "flos": 25956843310080.0, + "grad_norm": 1.6825588894165753, + "language_loss": 0.75854981, + "learning_rate": 1.46847343226731e-07, + "loss": 0.78047234, + "num_input_tokens_seen": 158563250, + "step": 7326, + "time_per_iteration": 2.816223621368408 + }, + { + "auxiliary_loss_clip": 0.01125892, + "auxiliary_loss_mlp": 0.01083429, + "balance_loss_clip": 1.02477121, + "balance_loss_mlp": 1.00332057, + "epoch": 0.8810196597126195, + "flos": 17092079303040.0, + "grad_norm": 1.9041657352197607, + "language_loss": 0.69612586, + "learning_rate": 1.465545072794203e-07, + "loss": 0.7182191, + "num_input_tokens_seen": 158581125, + "step": 7327, + "time_per_iteration": 2.6845216751098633 + }, + { + "auxiliary_loss_clip": 0.010738, + "auxiliary_loss_mlp": 0.0108503, + "balance_loss_clip": 1.01906407, + "balance_loss_mlp": 1.00501633, + "epoch": 0.8811399026032586, + "flos": 23002831785600.0, + "grad_norm": 1.5704005743887939, + "language_loss": 0.75711679, + "learning_rate": 1.4626195249831774e-07, + "loss": 0.77870512, + "num_input_tokens_seen": 158602025, + "step": 7328, + "time_per_iteration": 2.8387291431427 + }, + { + "auxiliary_loss_clip": 0.01127, + "auxiliary_loss_mlp": 0.0108471, + "balance_loss_clip": 1.02562761, + "balance_loss_mlp": 1.00460148, + "epoch": 0.8812601454938976, + "flos": 14463813242880.0, + "grad_norm": 2.5401897472436614, + "language_loss": 0.72097468, + "learning_rate": 1.4596967892780244e-07, + "loss": 0.7430917, + "num_input_tokens_seen": 158618355, + "step": 7329, + "time_per_iteration": 2.627702474594116 + }, + { + "auxiliary_loss_clip": 0.01134394, + "auxiliary_loss_mlp": 0.0108397, + "balance_loss_clip": 1.02549672, + "balance_loss_mlp": 1.00390863, + "epoch": 0.8813803883845368, + "flos": 22493223578880.0, + "grad_norm": 1.8330117667414212, + "language_loss": 0.74594831, + "learning_rate": 1.4567768661221314e-07, + "loss": 0.76813197, + "num_input_tokens_seen": 158638925, + "step": 7330, + "time_per_iteration": 2.6552653312683105 + }, + { + "auxiliary_loss_clip": 0.01125436, + "auxiliary_loss_mlp": 0.00872929, + "balance_loss_clip": 1.02454758, + "balance_loss_mlp": 1.00010872, + "epoch": 0.8815006312751759, + "flos": 21506901045120.0, + "grad_norm": 1.8967042789606594, + "language_loss": 0.74507535, + "learning_rate": 1.4538597559584442e-07, + "loss": 0.76505899, + "num_input_tokens_seen": 158656715, + "step": 7331, + "time_per_iteration": 2.6664180755615234 + }, + { + "auxiliary_loss_clip": 0.0111746, + "auxiliary_loss_mlp": 0.01084188, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.00403178, + "epoch": 0.8816208741658149, + "flos": 22784566792320.0, + "grad_norm": 1.785788145912713, + "language_loss": 0.79082525, + "learning_rate": 1.4509454592294823e-07, + "loss": 0.81284171, + "num_input_tokens_seen": 158677200, + "step": 7332, + "time_per_iteration": 2.791940927505493 + }, + { + "auxiliary_loss_clip": 0.01102283, + "auxiliary_loss_mlp": 0.0087295, + "balance_loss_clip": 1.02096081, + "balance_loss_mlp": 1.00008059, + "epoch": 0.8817411170564541, + "flos": 17779409026560.0, + "grad_norm": 1.7812173730906362, + "language_loss": 0.7888757, + "learning_rate": 1.448033976377354e-07, + "loss": 0.80862802, + "num_input_tokens_seen": 158692185, + "step": 7333, + "time_per_iteration": 2.712313413619995 + }, + { + "auxiliary_loss_clip": 0.01126328, + "auxiliary_loss_mlp": 0.01083609, + "balance_loss_clip": 1.02495348, + "balance_loss_mlp": 1.00345254, + "epoch": 0.8818613599470931, + "flos": 18551812112640.0, + "grad_norm": 2.002425247377114, + "language_loss": 0.74102098, + "learning_rate": 1.445125307843713e-07, + "loss": 0.76312035, + "num_input_tokens_seen": 158710410, + "step": 7334, + "time_per_iteration": 2.614471912384033 + }, + { + "auxiliary_loss_clip": 0.0112431, + "auxiliary_loss_mlp": 0.0108395, + "balance_loss_clip": 1.02519464, + "balance_loss_mlp": 1.00384188, + "epoch": 0.8819816028377322, + "flos": 27599792417280.0, + "grad_norm": 1.776943415938679, + "language_loss": 0.76025307, + "learning_rate": 1.442219454069813e-07, + "loss": 0.78233564, + "num_input_tokens_seen": 158731435, + "step": 7335, + "time_per_iteration": 3.4993691444396973 + }, + { + "auxiliary_loss_clip": 0.01091856, + "auxiliary_loss_mlp": 0.01083245, + "balance_loss_clip": 1.02275038, + "balance_loss_mlp": 1.00327921, + "epoch": 0.8821018457283714, + "flos": 23404600385280.0, + "grad_norm": 1.940052006208919, + "language_loss": 0.66108114, + "learning_rate": 1.4393164154964676e-07, + "loss": 0.68283218, + "num_input_tokens_seen": 158750965, + "step": 7336, + "time_per_iteration": 3.6841158866882324 + }, + { + "auxiliary_loss_clip": 0.01120414, + "auxiliary_loss_mlp": 0.01083663, + "balance_loss_clip": 1.02148652, + "balance_loss_mlp": 1.00355482, + "epoch": 0.8822220886190104, + "flos": 29132459792640.0, + "grad_norm": 1.9208264991754025, + "language_loss": 0.94097728, + "learning_rate": 1.4364161925640649e-07, + "loss": 0.96301806, + "num_input_tokens_seen": 158772365, + "step": 7337, + "time_per_iteration": 2.752659320831299 + }, + { + "auxiliary_loss_clip": 0.01134292, + "auxiliary_loss_mlp": 0.01083914, + "balance_loss_clip": 1.0251224, + "balance_loss_mlp": 1.00390089, + "epoch": 0.8823423315096495, + "flos": 20485422074880.0, + "grad_norm": 1.7795715369280565, + "language_loss": 0.84863818, + "learning_rate": 1.4335187857125663e-07, + "loss": 0.87082022, + "num_input_tokens_seen": 158791065, + "step": 7338, + "time_per_iteration": 2.69079852104187 + }, + { + "auxiliary_loss_clip": 0.01127362, + "auxiliary_loss_mlp": 0.01082792, + "balance_loss_clip": 1.02623236, + "balance_loss_mlp": 1.00277901, + "epoch": 0.8824625744002886, + "flos": 24206377818240.0, + "grad_norm": 1.7614861215485387, + "language_loss": 0.75501007, + "learning_rate": 1.4306241953815023e-07, + "loss": 0.77711159, + "num_input_tokens_seen": 158812125, + "step": 7339, + "time_per_iteration": 3.6828222274780273 + }, + { + "auxiliary_loss_clip": 0.01125226, + "auxiliary_loss_mlp": 0.01083171, + "balance_loss_clip": 1.0248065, + "balance_loss_mlp": 1.00306249, + "epoch": 0.8825828172909277, + "flos": 24679500785280.0, + "grad_norm": 2.1068031720175258, + "language_loss": 0.70734024, + "learning_rate": 1.4277324220099862e-07, + "loss": 0.72942424, + "num_input_tokens_seen": 158834035, + "step": 7340, + "time_per_iteration": 2.7699766159057617 + }, + { + "auxiliary_loss_clip": 0.01100953, + "auxiliary_loss_mlp": 0.01084765, + "balance_loss_clip": 1.02334177, + "balance_loss_mlp": 1.00470352, + "epoch": 0.8827030601815667, + "flos": 22456163721600.0, + "grad_norm": 2.2897997693861676, + "language_loss": 0.74226081, + "learning_rate": 1.4248434660366938e-07, + "loss": 0.76411796, + "num_input_tokens_seen": 158853510, + "step": 7341, + "time_per_iteration": 2.759561538696289 + }, + { + "auxiliary_loss_clip": 0.0111482, + "auxiliary_loss_mlp": 0.01082583, + "balance_loss_clip": 1.02386665, + "balance_loss_mlp": 1.00252223, + "epoch": 0.8828233030722058, + "flos": 19865639877120.0, + "grad_norm": 1.746114173896293, + "language_loss": 0.70668113, + "learning_rate": 1.4219573278998808e-07, + "loss": 0.72865522, + "num_input_tokens_seen": 158871970, + "step": 7342, + "time_per_iteration": 3.6688811779022217 + }, + { + "auxiliary_loss_clip": 0.01117769, + "auxiliary_loss_mlp": 0.01083721, + "balance_loss_clip": 1.02466226, + "balance_loss_mlp": 1.00351727, + "epoch": 0.882943545962845, + "flos": 39347213581440.0, + "grad_norm": 1.8610903603536375, + "language_loss": 0.64726102, + "learning_rate": 1.4190740080373685e-07, + "loss": 0.66927594, + "num_input_tokens_seen": 158892250, + "step": 7343, + "time_per_iteration": 2.892775774002075 + }, + { + "auxiliary_loss_clip": 0.01099505, + "auxiliary_loss_mlp": 0.01083367, + "balance_loss_clip": 1.0244863, + "balance_loss_mlp": 1.0032109, + "epoch": 0.883063788853484, + "flos": 19054524908160.0, + "grad_norm": 1.736647068682193, + "language_loss": 0.84334481, + "learning_rate": 1.4161935068865538e-07, + "loss": 0.86517358, + "num_input_tokens_seen": 158907395, + "step": 7344, + "time_per_iteration": 2.7179930210113525 + }, + { + "auxiliary_loss_clip": 0.01134603, + "auxiliary_loss_mlp": 0.01084187, + "balance_loss_clip": 1.02545071, + "balance_loss_mlp": 1.00407827, + "epoch": 0.8831840317441231, + "flos": 18733196816640.0, + "grad_norm": 1.8633612941669795, + "language_loss": 0.75445378, + "learning_rate": 1.4133158248844113e-07, + "loss": 0.77664161, + "num_input_tokens_seen": 158926300, + "step": 7345, + "time_per_iteration": 2.6482691764831543 + }, + { + "auxiliary_loss_clip": 0.01089189, + "auxiliary_loss_mlp": 0.01083705, + "balance_loss_clip": 1.02317142, + "balance_loss_mlp": 1.00354815, + "epoch": 0.8833042746347622, + "flos": 26827712553600.0, + "grad_norm": 1.7388586564952575, + "language_loss": 0.73226327, + "learning_rate": 1.4104409624674785e-07, + "loss": 0.7539922, + "num_input_tokens_seen": 158946085, + "step": 7346, + "time_per_iteration": 2.775470018386841 + }, + { + "auxiliary_loss_clip": 0.01124844, + "auxiliary_loss_mlp": 0.01083562, + "balance_loss_clip": 1.0249759, + "balance_loss_mlp": 1.00345314, + "epoch": 0.8834245175254013, + "flos": 26104077158400.0, + "grad_norm": 1.672424313717034, + "language_loss": 0.78444147, + "learning_rate": 1.407568920071873e-07, + "loss": 0.80652559, + "num_input_tokens_seen": 158964950, + "step": 7347, + "time_per_iteration": 2.7108559608459473 + }, + { + "auxiliary_loss_clip": 0.01137074, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_clip": 1.02712679, + "balance_loss_mlp": 1.00428033, + "epoch": 0.8835447604160404, + "flos": 30629036977920.0, + "grad_norm": 1.9118734956149517, + "language_loss": 0.68258524, + "learning_rate": 1.4046996981332782e-07, + "loss": 0.70480084, + "num_input_tokens_seen": 158984835, + "step": 7348, + "time_per_iteration": 2.713982105255127 + }, + { + "auxiliary_loss_clip": 0.01089413, + "auxiliary_loss_mlp": 0.01084404, + "balance_loss_clip": 1.02286911, + "balance_loss_mlp": 1.00415194, + "epoch": 0.8836650033066795, + "flos": 24718356322560.0, + "grad_norm": 2.9677041274862948, + "language_loss": 0.78284889, + "learning_rate": 1.4018332970869516e-07, + "loss": 0.80458707, + "num_input_tokens_seen": 159002775, + "step": 7349, + "time_per_iteration": 2.7937490940093994 + }, + { + "auxiliary_loss_clip": 0.01111814, + "auxiliary_loss_mlp": 0.01084001, + "balance_loss_clip": 1.02140939, + "balance_loss_mlp": 1.00379658, + "epoch": 0.8837852461973186, + "flos": 25413371556480.0, + "grad_norm": 1.842364988157827, + "language_loss": 0.85295886, + "learning_rate": 1.398969717367733e-07, + "loss": 0.87491697, + "num_input_tokens_seen": 159024100, + "step": 7350, + "time_per_iteration": 2.748840808868408 + }, + { + "auxiliary_loss_clip": 0.01079302, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_clip": 1.02365518, + "balance_loss_mlp": 1.00441027, + "epoch": 0.8839054890879576, + "flos": 17822574195840.0, + "grad_norm": 1.572029549460222, + "language_loss": 0.7606889, + "learning_rate": 1.396108959410014e-07, + "loss": 0.78232569, + "num_input_tokens_seen": 159043315, + "step": 7351, + "time_per_iteration": 2.709630012512207 + }, + { + "auxiliary_loss_clip": 0.01125877, + "auxiliary_loss_mlp": 0.00872973, + "balance_loss_clip": 1.02549541, + "balance_loss_mlp": 1.0000701, + "epoch": 0.8840257319785968, + "flos": 23769021818880.0, + "grad_norm": 1.4922632962145623, + "language_loss": 0.81424761, + "learning_rate": 1.3932510236477745e-07, + "loss": 0.83423615, + "num_input_tokens_seen": 159063985, + "step": 7352, + "time_per_iteration": 2.683791399002075 + }, + { + "auxiliary_loss_clip": 0.01126047, + "auxiliary_loss_mlp": 0.01084471, + "balance_loss_clip": 1.02455151, + "balance_loss_mlp": 1.00436282, + "epoch": 0.8841459748692359, + "flos": 29059776622080.0, + "grad_norm": 1.7547288723383774, + "language_loss": 0.5602169, + "learning_rate": 1.3903959105145636e-07, + "loss": 0.58232206, + "num_input_tokens_seen": 159084475, + "step": 7353, + "time_per_iteration": 2.6937294006347656 + }, + { + "auxiliary_loss_clip": 0.01134248, + "auxiliary_loss_mlp": 0.01083792, + "balance_loss_clip": 1.0249126, + "balance_loss_mlp": 1.00373149, + "epoch": 0.8842662177598749, + "flos": 24311523905280.0, + "grad_norm": 1.83902146454551, + "language_loss": 0.82876956, + "learning_rate": 1.387543620443492e-07, + "loss": 0.85095, + "num_input_tokens_seen": 159101320, + "step": 7354, + "time_per_iteration": 2.6405656337738037 + }, + { + "auxiliary_loss_clip": 0.01134043, + "auxiliary_loss_mlp": 0.01083963, + "balance_loss_clip": 1.02525449, + "balance_loss_mlp": 1.00385427, + "epoch": 0.8843864606505141, + "flos": 25007867942400.0, + "grad_norm": 1.523637072907809, + "language_loss": 0.84083253, + "learning_rate": 1.3846941538672606e-07, + "loss": 0.86301261, + "num_input_tokens_seen": 159120025, + "step": 7355, + "time_per_iteration": 2.622246265411377 + }, + { + "auxiliary_loss_clip": 0.01097684, + "auxiliary_loss_mlp": 0.01084114, + "balance_loss_clip": 1.02329075, + "balance_loss_mlp": 1.00405288, + "epoch": 0.8845067035411531, + "flos": 28183915388160.0, + "grad_norm": 2.0024819164847507, + "language_loss": 0.8085072, + "learning_rate": 1.3818475112181193e-07, + "loss": 0.83032513, + "num_input_tokens_seen": 159138820, + "step": 7356, + "time_per_iteration": 2.8833115100860596 + }, + { + "auxiliary_loss_clip": 0.01098825, + "auxiliary_loss_mlp": 0.01082606, + "balance_loss_clip": 1.02450025, + "balance_loss_mlp": 1.00259256, + "epoch": 0.8846269464317922, + "flos": 12853219311360.0, + "grad_norm": 1.985055191648328, + "language_loss": 0.79398388, + "learning_rate": 1.3790036929279091e-07, + "loss": 0.81579816, + "num_input_tokens_seen": 159155975, + "step": 7357, + "time_per_iteration": 2.676302909851074 + }, + { + "auxiliary_loss_clip": 0.0112521, + "auxiliary_loss_mlp": 0.00872804, + "balance_loss_clip": 1.02455938, + "balance_loss_mlp": 1.00005686, + "epoch": 0.8847471893224313, + "flos": 18624351628800.0, + "grad_norm": 1.9950170916494019, + "language_loss": 0.58780545, + "learning_rate": 1.3761626994280363e-07, + "loss": 0.60778558, + "num_input_tokens_seen": 159173445, + "step": 7358, + "time_per_iteration": 2.679361343383789 + }, + { + "auxiliary_loss_clip": 0.01107307, + "auxiliary_loss_mlp": 0.01083766, + "balance_loss_clip": 1.02264714, + "balance_loss_mlp": 1.00375295, + "epoch": 0.8848674322130704, + "flos": 35769433449600.0, + "grad_norm": 1.8328738450338307, + "language_loss": 0.73482919, + "learning_rate": 1.3733245311494735e-07, + "loss": 0.75673985, + "num_input_tokens_seen": 159196100, + "step": 7359, + "time_per_iteration": 2.8818371295928955 + }, + { + "auxiliary_loss_clip": 0.01125286, + "auxiliary_loss_mlp": 0.01084474, + "balance_loss_clip": 1.02493954, + "balance_loss_mlp": 1.00431824, + "epoch": 0.8849876751037095, + "flos": 24243760897920.0, + "grad_norm": 2.2664477476832308, + "language_loss": 0.70797855, + "learning_rate": 1.3704891885227676e-07, + "loss": 0.73007607, + "num_input_tokens_seen": 159216145, + "step": 7360, + "time_per_iteration": 2.7397642135620117 + }, + { + "auxiliary_loss_clip": 0.01109958, + "auxiliary_loss_mlp": 0.01084734, + "balance_loss_clip": 1.02451515, + "balance_loss_mlp": 1.00457823, + "epoch": 0.8851079179943486, + "flos": 21500580251520.0, + "grad_norm": 2.162754677514286, + "language_loss": 0.77942896, + "learning_rate": 1.367656671978037e-07, + "loss": 0.80137587, + "num_input_tokens_seen": 159233610, + "step": 7361, + "time_per_iteration": 3.669032573699951 + }, + { + "auxiliary_loss_clip": 0.0111851, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_clip": 1.02499151, + "balance_loss_mlp": 1.00376749, + "epoch": 0.8852281608849877, + "flos": 15300711198720.0, + "grad_norm": 1.898945926915598, + "language_loss": 0.7323879, + "learning_rate": 1.36482698194498e-07, + "loss": 0.75441134, + "num_input_tokens_seen": 159250155, + "step": 7362, + "time_per_iteration": 3.543159008026123 + }, + { + "auxiliary_loss_clip": 0.01115717, + "auxiliary_loss_mlp": 0.01084939, + "balance_loss_clip": 1.02382791, + "balance_loss_mlp": 1.00478303, + "epoch": 0.8853484037756267, + "flos": 23295719283840.0, + "grad_norm": 2.371912832154523, + "language_loss": 0.71839237, + "learning_rate": 1.3620001188528506e-07, + "loss": 0.74039888, + "num_input_tokens_seen": 159270875, + "step": 7363, + "time_per_iteration": 2.7253811359405518 + }, + { + "auxiliary_loss_clip": 0.01125027, + "auxiliary_loss_mlp": 0.01084157, + "balance_loss_clip": 1.02450299, + "balance_loss_mlp": 1.00395334, + "epoch": 0.8854686466662659, + "flos": 25114773795840.0, + "grad_norm": 2.3624376509886367, + "language_loss": 0.73292243, + "learning_rate": 1.3591760831304865e-07, + "loss": 0.7550143, + "num_input_tokens_seen": 159288565, + "step": 7364, + "time_per_iteration": 2.6909046173095703 + }, + { + "auxiliary_loss_clip": 0.01133759, + "auxiliary_loss_mlp": 0.01084241, + "balance_loss_clip": 1.02488708, + "balance_loss_mlp": 1.00408423, + "epoch": 0.885588889556905, + "flos": 21390873137280.0, + "grad_norm": 1.5863858248022984, + "language_loss": 0.79054117, + "learning_rate": 1.356354875206287e-07, + "loss": 0.81272113, + "num_input_tokens_seen": 159306400, + "step": 7365, + "time_per_iteration": 3.556037664413452 + }, + { + "auxiliary_loss_clip": 0.01103862, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_clip": 1.0229367, + "balance_loss_mlp": 1.00394154, + "epoch": 0.885709132447544, + "flos": 26906752431360.0, + "grad_norm": 4.19451053186545, + "language_loss": 0.69770151, + "learning_rate": 1.3535364955082296e-07, + "loss": 0.71958107, + "num_input_tokens_seen": 159326250, + "step": 7366, + "time_per_iteration": 2.814340114593506 + }, + { + "auxiliary_loss_clip": 0.01134765, + "auxiliary_loss_mlp": 0.01084515, + "balance_loss_clip": 1.02593541, + "balance_loss_mlp": 1.00445449, + "epoch": 0.8858293753381832, + "flos": 26103394800000.0, + "grad_norm": 1.67977087805043, + "language_loss": 0.64663422, + "learning_rate": 1.3507209444638613e-07, + "loss": 0.66882706, + "num_input_tokens_seen": 159348250, + "step": 7367, + "time_per_iteration": 3.625657796859741 + }, + { + "auxiliary_loss_clip": 0.01124593, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_clip": 1.02452171, + "balance_loss_mlp": 1.0044204, + "epoch": 0.8859496182288222, + "flos": 23292810282240.0, + "grad_norm": 1.8991933651735506, + "language_loss": 0.73934513, + "learning_rate": 1.347908222500298e-07, + "loss": 0.76143682, + "num_input_tokens_seen": 159368325, + "step": 7368, + "time_per_iteration": 2.680145740509033 + }, + { + "auxiliary_loss_clip": 0.011016, + "auxiliary_loss_mlp": 0.01085256, + "balance_loss_clip": 1.02037835, + "balance_loss_mlp": 1.00514781, + "epoch": 0.8860698611194613, + "flos": 16872916469760.0, + "grad_norm": 1.8250478313084324, + "language_loss": 0.69667143, + "learning_rate": 1.3450983300442276e-07, + "loss": 0.71854001, + "num_input_tokens_seen": 159387555, + "step": 7369, + "time_per_iteration": 2.797991991043091 + }, + { + "auxiliary_loss_clip": 0.01126193, + "auxiliary_loss_mlp": 0.0108329, + "balance_loss_clip": 1.02532744, + "balance_loss_mlp": 1.00318122, + "epoch": 0.8861901040101005, + "flos": 24681404206080.0, + "grad_norm": 1.885501319935962, + "language_loss": 0.73795438, + "learning_rate": 1.3422912675219068e-07, + "loss": 0.76004922, + "num_input_tokens_seen": 159407310, + "step": 7370, + "time_per_iteration": 2.738903284072876 + }, + { + "auxiliary_loss_clip": 0.0113513, + "auxiliary_loss_mlp": 0.01083538, + "balance_loss_clip": 1.02643538, + "balance_loss_mlp": 1.00347662, + "epoch": 0.8863103469007395, + "flos": 24423026699520.0, + "grad_norm": 1.4756076160846998, + "language_loss": 0.7908932, + "learning_rate": 1.339487035359166e-07, + "loss": 0.81307989, + "num_input_tokens_seen": 159427680, + "step": 7371, + "time_per_iteration": 2.7439968585968018 + }, + { + "auxiliary_loss_clip": 0.01115912, + "auxiliary_loss_mlp": 0.00872744, + "balance_loss_clip": 1.02426386, + "balance_loss_mlp": 1.00015485, + "epoch": 0.8864305897913786, + "flos": 22053964158720.0, + "grad_norm": 1.5202729086906606, + "language_loss": 0.84715557, + "learning_rate": 1.336685633981409e-07, + "loss": 0.86704218, + "num_input_tokens_seen": 159448765, + "step": 7372, + "time_per_iteration": 2.745509147644043 + }, + { + "auxiliary_loss_clip": 0.01126535, + "auxiliary_loss_mlp": 0.01085146, + "balance_loss_clip": 1.02579999, + "balance_loss_mlp": 1.0049901, + "epoch": 0.8865508326820177, + "flos": 19099449843840.0, + "grad_norm": 1.8828301589157754, + "language_loss": 0.74715698, + "learning_rate": 1.333887063813597e-07, + "loss": 0.76927388, + "num_input_tokens_seen": 159466870, + "step": 7373, + "time_per_iteration": 2.688581943511963 + }, + { + "auxiliary_loss_clip": 0.01116668, + "auxiliary_loss_mlp": 0.01084296, + "balance_loss_clip": 1.02446675, + "balance_loss_mlp": 1.00423515, + "epoch": 0.8866710755726568, + "flos": 15414189240960.0, + "grad_norm": 1.875036499386425, + "language_loss": 0.66260374, + "learning_rate": 1.331091325280278e-07, + "loss": 0.68461335, + "num_input_tokens_seen": 159485840, + "step": 7374, + "time_per_iteration": 2.6980550289154053 + }, + { + "auxiliary_loss_clip": 0.01092449, + "auxiliary_loss_mlp": 0.01084592, + "balance_loss_clip": 1.01973116, + "balance_loss_mlp": 1.00434053, + "epoch": 0.8867913184632958, + "flos": 20083689388800.0, + "grad_norm": 1.5889079754965918, + "language_loss": 0.78423548, + "learning_rate": 1.3282984188055625e-07, + "loss": 0.80600584, + "num_input_tokens_seen": 159505630, + "step": 7375, + "time_per_iteration": 2.803844690322876 + }, + { + "auxiliary_loss_clip": 0.01135061, + "auxiliary_loss_mlp": 0.01083567, + "balance_loss_clip": 1.02558386, + "balance_loss_mlp": 1.00345838, + "epoch": 0.8869115613539349, + "flos": 23365852588800.0, + "grad_norm": 1.7135300293183642, + "language_loss": 0.79622877, + "learning_rate": 1.3255083448131288e-07, + "loss": 0.81841505, + "num_input_tokens_seen": 159524675, + "step": 7376, + "time_per_iteration": 2.621082067489624 + }, + { + "auxiliary_loss_clip": 0.01125695, + "auxiliary_loss_mlp": 0.01083142, + "balance_loss_clip": 1.02429461, + "balance_loss_mlp": 1.00308156, + "epoch": 0.8870318042445741, + "flos": 21286840371840.0, + "grad_norm": 1.9906928352338218, + "language_loss": 0.79069918, + "learning_rate": 1.3227211037262365e-07, + "loss": 0.81278759, + "num_input_tokens_seen": 159541915, + "step": 7377, + "time_per_iteration": 2.8138222694396973 + }, + { + "auxiliary_loss_clip": 0.01082539, + "auxiliary_loss_mlp": 0.01084031, + "balance_loss_clip": 1.02331984, + "balance_loss_mlp": 1.00392199, + "epoch": 0.8871520471352131, + "flos": 20010862563840.0, + "grad_norm": 1.8954198566916773, + "language_loss": 0.85461503, + "learning_rate": 1.319936695967696e-07, + "loss": 0.87628078, + "num_input_tokens_seen": 159559740, + "step": 7378, + "time_per_iteration": 2.810009717941284 + }, + { + "auxiliary_loss_clip": 0.0113585, + "auxiliary_loss_mlp": 0.0108403, + "balance_loss_clip": 1.02556515, + "balance_loss_mlp": 1.0037781, + "epoch": 0.8872722900258522, + "flos": 22601422321920.0, + "grad_norm": 2.2495078966172453, + "language_loss": 0.82290733, + "learning_rate": 1.3171551219599097e-07, + "loss": 0.84510618, + "num_input_tokens_seen": 159578265, + "step": 7379, + "time_per_iteration": 2.6146469116210938 + }, + { + "auxiliary_loss_clip": 0.01136165, + "auxiliary_loss_mlp": 0.01084421, + "balance_loss_clip": 1.02738059, + "balance_loss_mlp": 1.00435972, + "epoch": 0.8873925329164913, + "flos": 22163276223360.0, + "grad_norm": 2.9493340036633664, + "language_loss": 0.78122962, + "learning_rate": 1.3143763821248377e-07, + "loss": 0.80343544, + "num_input_tokens_seen": 159595350, + "step": 7380, + "time_per_iteration": 2.6222667694091797 + }, + { + "auxiliary_loss_clip": 0.01134675, + "auxiliary_loss_mlp": 0.01083441, + "balance_loss_clip": 1.02552021, + "balance_loss_mlp": 1.0033797, + "epoch": 0.8875127758071304, + "flos": 19208223204480.0, + "grad_norm": 1.987160827412374, + "language_loss": 0.72359288, + "learning_rate": 1.3116004768840118e-07, + "loss": 0.74577403, + "num_input_tokens_seen": 159613725, + "step": 7381, + "time_per_iteration": 2.717419147491455 + }, + { + "auxiliary_loss_clip": 0.01134039, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_clip": 1.02505851, + "balance_loss_mlp": 1.00439417, + "epoch": 0.8876330186977694, + "flos": 18110900666880.0, + "grad_norm": 1.5709451091214874, + "language_loss": 0.74097168, + "learning_rate": 1.3088274066585348e-07, + "loss": 0.76315761, + "num_input_tokens_seen": 159631335, + "step": 7382, + "time_per_iteration": 2.6293652057647705 + }, + { + "auxiliary_loss_clip": 0.01107704, + "auxiliary_loss_mlp": 0.01084603, + "balance_loss_clip": 1.02319503, + "balance_loss_mlp": 1.00449443, + "epoch": 0.8877532615884086, + "flos": 22009434272640.0, + "grad_norm": 2.0560016265616605, + "language_loss": 0.90465653, + "learning_rate": 1.3060571718690749e-07, + "loss": 0.92657959, + "num_input_tokens_seen": 159648830, + "step": 7383, + "time_per_iteration": 2.7398135662078857 + }, + { + "auxiliary_loss_clip": 0.01087228, + "auxiliary_loss_mlp": 0.00872838, + "balance_loss_clip": 1.01648521, + "balance_loss_mlp": 1.00111914, + "epoch": 0.8878735044790477, + "flos": 72136924346880.0, + "grad_norm": 0.748076059535806, + "language_loss": 0.56954998, + "learning_rate": 1.3032897729358805e-07, + "loss": 0.58915067, + "num_input_tokens_seen": 159709785, + "step": 7384, + "time_per_iteration": 3.3640451431274414 + }, + { + "auxiliary_loss_clip": 0.0108884, + "auxiliary_loss_mlp": 0.00872974, + "balance_loss_clip": 1.02240133, + "balance_loss_mlp": 1.00005364, + "epoch": 0.8879937473696867, + "flos": 27526355061120.0, + "grad_norm": 2.0246583608121855, + "language_loss": 0.80118227, + "learning_rate": 1.3005252102787645e-07, + "loss": 0.82080036, + "num_input_tokens_seen": 159728725, + "step": 7385, + "time_per_iteration": 2.866853952407837 + }, + { + "auxiliary_loss_clip": 0.01125658, + "auxiliary_loss_mlp": 0.01085222, + "balance_loss_clip": 1.02456641, + "balance_loss_mlp": 1.00506532, + "epoch": 0.8881139902603259, + "flos": 22234091886720.0, + "grad_norm": 1.5579572736951304, + "language_loss": 0.73710793, + "learning_rate": 1.297763484317105e-07, + "loss": 0.75921667, + "num_input_tokens_seen": 159747020, + "step": 7386, + "time_per_iteration": 3.525916337966919 + }, + { + "auxiliary_loss_clip": 0.01082918, + "auxiliary_loss_mlp": 0.0087293, + "balance_loss_clip": 1.02350473, + "balance_loss_mlp": 1.00009203, + "epoch": 0.888234233150965, + "flos": 20299548170880.0, + "grad_norm": 30.915905107179533, + "language_loss": 0.70389026, + "learning_rate": 1.2950045954698551e-07, + "loss": 0.72344875, + "num_input_tokens_seen": 159764855, + "step": 7387, + "time_per_iteration": 3.6893324851989746 + }, + { + "auxiliary_loss_clip": 0.01106988, + "auxiliary_loss_mlp": 0.01083593, + "balance_loss_clip": 1.02393913, + "balance_loss_mlp": 1.00362778, + "epoch": 0.888354476041604, + "flos": 18147996437760.0, + "grad_norm": 1.5052789534461894, + "language_loss": 0.75379962, + "learning_rate": 1.2922485441555343e-07, + "loss": 0.77570546, + "num_input_tokens_seen": 159783935, + "step": 7388, + "time_per_iteration": 2.7074358463287354 + }, + { + "auxiliary_loss_clip": 0.01134591, + "auxiliary_loss_mlp": 0.01083396, + "balance_loss_clip": 1.0251832, + "balance_loss_mlp": 1.00328779, + "epoch": 0.8884747189322432, + "flos": 22014282608640.0, + "grad_norm": 1.6294905711659244, + "language_loss": 0.81769991, + "learning_rate": 1.2894953307922363e-07, + "loss": 0.83987975, + "num_input_tokens_seen": 159802895, + "step": 7389, + "time_per_iteration": 2.6175129413604736 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.01085162, + "balance_loss_clip": 1.02263236, + "balance_loss_mlp": 1.00495791, + "epoch": 0.8885949618228822, + "flos": 19786779567360.0, + "grad_norm": 1.785570760511726, + "language_loss": 0.83920366, + "learning_rate": 1.2867449557976208e-07, + "loss": 0.8611052, + "num_input_tokens_seen": 159820995, + "step": 7390, + "time_per_iteration": 2.7484424114227295 + }, + { + "auxiliary_loss_clip": 0.01125489, + "auxiliary_loss_mlp": 0.01084075, + "balance_loss_clip": 1.02478743, + "balance_loss_mlp": 1.00396609, + "epoch": 0.8887152047135213, + "flos": 20047599198720.0, + "grad_norm": 2.7294875927365734, + "language_loss": 0.75380045, + "learning_rate": 1.283997419588916e-07, + "loss": 0.77589607, + "num_input_tokens_seen": 159840465, + "step": 7391, + "time_per_iteration": 3.580791711807251 + }, + { + "auxiliary_loss_clip": 0.01126754, + "auxiliary_loss_mlp": 0.01084171, + "balance_loss_clip": 1.02548432, + "balance_loss_mlp": 1.00406253, + "epoch": 0.8888354476041604, + "flos": 18588117784320.0, + "grad_norm": 1.8502380216820538, + "language_loss": 0.6183942, + "learning_rate": 1.2812527225829216e-07, + "loss": 0.64050353, + "num_input_tokens_seen": 159858690, + "step": 7392, + "time_per_iteration": 2.6632020473480225 + }, + { + "auxiliary_loss_clip": 0.01126123, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_clip": 1.02478194, + "balance_loss_mlp": 1.00419021, + "epoch": 0.8889556904947995, + "flos": 21689794120320.0, + "grad_norm": 1.8964936762660827, + "language_loss": 0.76567769, + "learning_rate": 1.2785108651960052e-07, + "loss": 0.78778291, + "num_input_tokens_seen": 159880325, + "step": 7393, + "time_per_iteration": 3.638686180114746 + }, + { + "auxiliary_loss_clip": 0.01126321, + "auxiliary_loss_mlp": 0.01084837, + "balance_loss_clip": 1.02506936, + "balance_loss_mlp": 1.00468087, + "epoch": 0.8890759333854386, + "flos": 27381204201600.0, + "grad_norm": 1.949301090807417, + "language_loss": 0.80499429, + "learning_rate": 1.2757718478441094e-07, + "loss": 0.82710588, + "num_input_tokens_seen": 159901070, + "step": 7394, + "time_per_iteration": 2.693535327911377 + }, + { + "auxiliary_loss_clip": 0.01116588, + "auxiliary_loss_mlp": 0.01084213, + "balance_loss_clip": 1.02419567, + "balance_loss_mlp": 1.00415206, + "epoch": 0.8891961762760777, + "flos": 24498834353280.0, + "grad_norm": 1.8910942459771432, + "language_loss": 0.7709347, + "learning_rate": 1.2730356709427302e-07, + "loss": 0.79294276, + "num_input_tokens_seen": 159919750, + "step": 7395, + "time_per_iteration": 2.7385263442993164 + }, + { + "auxiliary_loss_clip": 0.01120506, + "auxiliary_loss_mlp": 0.01083921, + "balance_loss_clip": 1.02194428, + "balance_loss_mlp": 1.00385976, + "epoch": 0.8893164191667168, + "flos": 41499770895360.0, + "grad_norm": 1.77395731601319, + "language_loss": 0.59470737, + "learning_rate": 1.2703023349069542e-07, + "loss": 0.61675167, + "num_input_tokens_seen": 159944600, + "step": 7396, + "time_per_iteration": 2.8397305011749268 + }, + { + "auxiliary_loss_clip": 0.0112406, + "auxiliary_loss_mlp": 0.01083513, + "balance_loss_clip": 1.02422893, + "balance_loss_mlp": 1.0033567, + "epoch": 0.8894366620573558, + "flos": 33583623120000.0, + "grad_norm": 1.6791625280207014, + "language_loss": 0.61486787, + "learning_rate": 1.2675718401514223e-07, + "loss": 0.63694358, + "num_input_tokens_seen": 159968780, + "step": 7397, + "time_per_iteration": 2.8087968826293945 + }, + { + "auxiliary_loss_clip": 0.01116746, + "auxiliary_loss_mlp": 0.01084166, + "balance_loss_clip": 1.02518427, + "balance_loss_mlp": 1.00400996, + "epoch": 0.889556904947995, + "flos": 16909832672640.0, + "grad_norm": 1.9227302265072288, + "language_loss": 0.74511886, + "learning_rate": 1.264844187090346e-07, + "loss": 0.76712787, + "num_input_tokens_seen": 159985905, + "step": 7398, + "time_per_iteration": 2.695249319076538 + }, + { + "auxiliary_loss_clip": 0.01117844, + "auxiliary_loss_mlp": 0.01083258, + "balance_loss_clip": 1.02477431, + "balance_loss_mlp": 1.0032444, + "epoch": 0.889677147838634, + "flos": 26030855283840.0, + "grad_norm": 1.7919444433377067, + "language_loss": 0.75020498, + "learning_rate": 1.262119376137516e-07, + "loss": 0.77221596, + "num_input_tokens_seen": 160006965, + "step": 7399, + "time_per_iteration": 2.764982223510742 + }, + { + "auxiliary_loss_clip": 0.01125256, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_clip": 1.02458942, + "balance_loss_mlp": 1.00342119, + "epoch": 0.8897973907292731, + "flos": 26468283110400.0, + "grad_norm": 1.7173276076847575, + "language_loss": 0.84836495, + "learning_rate": 1.2593974077062707e-07, + "loss": 0.87045181, + "num_input_tokens_seen": 160028585, + "step": 7400, + "time_per_iteration": 2.701789617538452 + }, + { + "auxiliary_loss_clip": 0.01099586, + "auxiliary_loss_mlp": 0.01083594, + "balance_loss_clip": 1.0229075, + "balance_loss_mlp": 1.00348568, + "epoch": 0.8899176336199123, + "flos": 26249694894720.0, + "grad_norm": 1.7449983105477296, + "language_loss": 0.63771379, + "learning_rate": 1.2566782822095423e-07, + "loss": 0.6595456, + "num_input_tokens_seen": 160048840, + "step": 7401, + "time_per_iteration": 2.8105456829071045 + }, + { + "auxiliary_loss_clip": 0.01107725, + "auxiliary_loss_mlp": 0.01085479, + "balance_loss_clip": 1.02440405, + "balance_loss_mlp": 1.00532317, + "epoch": 0.8900378765105513, + "flos": 20811742156800.0, + "grad_norm": 1.7273753722380893, + "language_loss": 0.71425962, + "learning_rate": 1.2539620000598162e-07, + "loss": 0.73619169, + "num_input_tokens_seen": 160068175, + "step": 7402, + "time_per_iteration": 2.738957643508911 + }, + { + "auxiliary_loss_clip": 0.01134133, + "auxiliary_loss_mlp": 0.01083613, + "balance_loss_clip": 1.02513242, + "balance_loss_mlp": 1.00350451, + "epoch": 0.8901581194011904, + "flos": 16472333018880.0, + "grad_norm": 1.7313184041644138, + "language_loss": 0.79834741, + "learning_rate": 1.2512485616691492e-07, + "loss": 0.82052487, + "num_input_tokens_seen": 160085230, + "step": 7403, + "time_per_iteration": 2.615013360977173 + }, + { + "auxiliary_loss_clip": 0.01110935, + "auxiliary_loss_mlp": 0.01085136, + "balance_loss_clip": 1.02570152, + "balance_loss_mlp": 1.00502777, + "epoch": 0.8902783622918296, + "flos": 35155253773440.0, + "grad_norm": 1.955527033339486, + "language_loss": 0.80910188, + "learning_rate": 1.2485379674491681e-07, + "loss": 0.83106256, + "num_input_tokens_seen": 160111425, + "step": 7404, + "time_per_iteration": 2.9563136100769043 + }, + { + "auxiliary_loss_clip": 0.01115324, + "auxiliary_loss_mlp": 0.01083997, + "balance_loss_clip": 1.02437568, + "balance_loss_mlp": 1.00384116, + "epoch": 0.8903986051824686, + "flos": 17201068145280.0, + "grad_norm": 2.1276594657116052, + "language_loss": 0.79069829, + "learning_rate": 1.2458302178110657e-07, + "loss": 0.81269151, + "num_input_tokens_seen": 160129790, + "step": 7405, + "time_per_iteration": 2.670147657394409 + }, + { + "auxiliary_loss_clip": 0.01105547, + "auxiliary_loss_mlp": 0.01084019, + "balance_loss_clip": 1.02309382, + "balance_loss_mlp": 1.00395775, + "epoch": 0.8905188480731077, + "flos": 25483863997440.0, + "grad_norm": 3.1781116272875414, + "language_loss": 0.82189381, + "learning_rate": 1.2431253131656118e-07, + "loss": 0.84378952, + "num_input_tokens_seen": 160149265, + "step": 7406, + "time_per_iteration": 2.823695421218872 + }, + { + "auxiliary_loss_clip": 0.01107579, + "auxiliary_loss_mlp": 0.01083354, + "balance_loss_clip": 1.02259803, + "balance_loss_mlp": 1.00314975, + "epoch": 0.8906390909637467, + "flos": 23365888502400.0, + "grad_norm": 1.9210072123180972, + "language_loss": 0.76290905, + "learning_rate": 1.240423253923133e-07, + "loss": 0.78481835, + "num_input_tokens_seen": 160168870, + "step": 7407, + "time_per_iteration": 2.699687957763672 + }, + { + "auxiliary_loss_clip": 0.01128122, + "auxiliary_loss_mlp": 0.01084352, + "balance_loss_clip": 1.02633643, + "balance_loss_mlp": 1.00419593, + "epoch": 0.8907593338543859, + "flos": 21068790860160.0, + "grad_norm": 1.721839700906193, + "language_loss": 0.69350851, + "learning_rate": 1.237724040493533e-07, + "loss": 0.71563321, + "num_input_tokens_seen": 160187495, + "step": 7408, + "time_per_iteration": 2.668766736984253 + }, + { + "auxiliary_loss_clip": 0.01136032, + "auxiliary_loss_mlp": 0.01084231, + "balance_loss_clip": 1.02695847, + "balance_loss_mlp": 1.00397968, + "epoch": 0.8908795767450249, + "flos": 21869562712320.0, + "grad_norm": 2.5189580345734828, + "language_loss": 0.73264998, + "learning_rate": 1.2350276732862773e-07, + "loss": 0.75485265, + "num_input_tokens_seen": 160208520, + "step": 7409, + "time_per_iteration": 2.6252739429473877 + }, + { + "auxiliary_loss_clip": 0.01104697, + "auxiliary_loss_mlp": 0.01079339, + "balance_loss_clip": 1.01745427, + "balance_loss_mlp": 1.00037515, + "epoch": 0.890999819635664, + "flos": 66307869348480.0, + "grad_norm": 0.8289682255646342, + "language_loss": 0.56672817, + "learning_rate": 1.2323341527103993e-07, + "loss": 0.58856851, + "num_input_tokens_seen": 160263720, + "step": 7410, + "time_per_iteration": 3.172879934310913 + }, + { + "auxiliary_loss_clip": 0.01134079, + "auxiliary_loss_mlp": 0.01083786, + "balance_loss_clip": 1.02473593, + "balance_loss_mlp": 1.00367701, + "epoch": 0.8911200625263032, + "flos": 26869908055680.0, + "grad_norm": 2.6021789378901707, + "language_loss": 0.84673536, + "learning_rate": 1.2296434791745135e-07, + "loss": 0.86891401, + "num_input_tokens_seen": 160282170, + "step": 7411, + "time_per_iteration": 2.614182949066162 + }, + { + "auxiliary_loss_clip": 0.01125283, + "auxiliary_loss_mlp": 0.01083769, + "balance_loss_clip": 1.02465677, + "balance_loss_mlp": 1.00356483, + "epoch": 0.8912403054169422, + "flos": 20885825957760.0, + "grad_norm": 1.6465175605016753, + "language_loss": 0.76752281, + "learning_rate": 1.2269556530867875e-07, + "loss": 0.78961337, + "num_input_tokens_seen": 160300725, + "step": 7412, + "time_per_iteration": 3.4425387382507324 + }, + { + "auxiliary_loss_clip": 0.01136696, + "auxiliary_loss_mlp": 0.01085033, + "balance_loss_clip": 1.02664328, + "balance_loss_mlp": 1.00473404, + "epoch": 0.8913605483075813, + "flos": 27016567286400.0, + "grad_norm": 1.9671266801339933, + "language_loss": 0.82181185, + "learning_rate": 1.2242706748549614e-07, + "loss": 0.84402913, + "num_input_tokens_seen": 160318720, + "step": 7413, + "time_per_iteration": 3.6737358570098877 + }, + { + "auxiliary_loss_clip": 0.01117962, + "auxiliary_loss_mlp": 0.01083232, + "balance_loss_clip": 1.02452612, + "balance_loss_mlp": 1.00312304, + "epoch": 0.8914807911982204, + "flos": 23621500661760.0, + "grad_norm": 1.9397225924719153, + "language_loss": 0.81941462, + "learning_rate": 1.2215885448863473e-07, + "loss": 0.84142661, + "num_input_tokens_seen": 160339595, + "step": 7414, + "time_per_iteration": 2.810434341430664 + }, + { + "auxiliary_loss_clip": 0.01115547, + "auxiliary_loss_mlp": 0.01084663, + "balance_loss_clip": 1.0243715, + "balance_loss_mlp": 1.00460255, + "epoch": 0.8916010340888595, + "flos": 24462277286400.0, + "grad_norm": 1.63687990783397, + "language_loss": 0.80377698, + "learning_rate": 1.2189092635878152e-07, + "loss": 0.82577908, + "num_input_tokens_seen": 160361045, + "step": 7415, + "time_per_iteration": 2.8760907649993896 + }, + { + "auxiliary_loss_clip": 0.01098211, + "auxiliary_loss_mlp": 0.01083783, + "balance_loss_clip": 1.02227557, + "balance_loss_mlp": 1.00362694, + "epoch": 0.8917212769794985, + "flos": 21215773313280.0, + "grad_norm": 1.983721351037304, + "language_loss": 0.77189529, + "learning_rate": 1.216232831365822e-07, + "loss": 0.79371524, + "num_input_tokens_seen": 160379990, + "step": 7416, + "time_per_iteration": 2.7790942192077637 + }, + { + "auxiliary_loss_clip": 0.01117546, + "auxiliary_loss_mlp": 0.01083266, + "balance_loss_clip": 1.02486777, + "balance_loss_mlp": 1.00320506, + "epoch": 0.8918415198701377, + "flos": 25513992529920.0, + "grad_norm": 1.751292265182139, + "language_loss": 0.80877817, + "learning_rate": 1.2135592486263678e-07, + "loss": 0.83078629, + "num_input_tokens_seen": 160399240, + "step": 7417, + "time_per_iteration": 3.657365560531616 + }, + { + "auxiliary_loss_clip": 0.0111312, + "auxiliary_loss_mlp": 0.01084212, + "balance_loss_clip": 1.02185094, + "balance_loss_mlp": 1.00415087, + "epoch": 0.8919617627607768, + "flos": 37853006693760.0, + "grad_norm": 1.6958969931668848, + "language_loss": 0.61015785, + "learning_rate": 1.2108885157750415e-07, + "loss": 0.63213116, + "num_input_tokens_seen": 160421600, + "step": 7418, + "time_per_iteration": 3.7901906967163086 + }, + { + "auxiliary_loss_clip": 0.01104061, + "auxiliary_loss_mlp": 0.00872818, + "balance_loss_clip": 1.02295744, + "balance_loss_mlp": 1.00015688, + "epoch": 0.8920820056514158, + "flos": 26213676531840.0, + "grad_norm": 2.2181550477430267, + "language_loss": 0.80306542, + "learning_rate": 1.2082206332169897e-07, + "loss": 0.82283419, + "num_input_tokens_seen": 160441695, + "step": 7419, + "time_per_iteration": 2.777230739593506 + }, + { + "auxiliary_loss_clip": 0.01114731, + "auxiliary_loss_mlp": 0.01083259, + "balance_loss_clip": 1.02374887, + "balance_loss_mlp": 1.00324607, + "epoch": 0.892202248542055, + "flos": 17383135207680.0, + "grad_norm": 2.877671147086002, + "language_loss": 0.73443538, + "learning_rate": 1.2055556013569225e-07, + "loss": 0.75641525, + "num_input_tokens_seen": 160457205, + "step": 7420, + "time_per_iteration": 2.665114402770996 + }, + { + "auxiliary_loss_clip": 0.01098629, + "auxiliary_loss_mlp": 0.01084627, + "balance_loss_clip": 1.02410543, + "balance_loss_mlp": 1.00456655, + "epoch": 0.892322491432694, + "flos": 21324223451520.0, + "grad_norm": 1.8238653076392426, + "language_loss": 0.82048571, + "learning_rate": 1.2028934205991315e-07, + "loss": 0.8423183, + "num_input_tokens_seen": 160476525, + "step": 7421, + "time_per_iteration": 2.7231948375701904 + }, + { + "auxiliary_loss_clip": 0.01125628, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_clip": 1.02487373, + "balance_loss_mlp": 1.00389647, + "epoch": 0.8924427343233331, + "flos": 24029374573440.0, + "grad_norm": 1.3745914119371108, + "language_loss": 0.76868927, + "learning_rate": 1.2002340913474607e-07, + "loss": 0.79078615, + "num_input_tokens_seen": 160500160, + "step": 7422, + "time_per_iteration": 2.7578184604644775 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01083602, + "balance_loss_clip": 1.02623391, + "balance_loss_mlp": 1.00349367, + "epoch": 0.8925629772139723, + "flos": 30008069631360.0, + "grad_norm": 2.044196753445785, + "language_loss": 0.73938346, + "learning_rate": 1.1975776140053317e-07, + "loss": 0.76157558, + "num_input_tokens_seen": 160520130, + "step": 7423, + "time_per_iteration": 2.7006781101226807 + }, + { + "auxiliary_loss_clip": 0.01098689, + "auxiliary_loss_mlp": 0.01085036, + "balance_loss_clip": 1.02385259, + "balance_loss_mlp": 1.00487947, + "epoch": 0.8926832201046113, + "flos": 22601709630720.0, + "grad_norm": 2.068984702400902, + "language_loss": 0.73293698, + "learning_rate": 1.194923988975729e-07, + "loss": 0.75477421, + "num_input_tokens_seen": 160539730, + "step": 7424, + "time_per_iteration": 2.8022103309631348 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01083282, + "balance_loss_clip": 1.02274668, + "balance_loss_mlp": 1.00298274, + "epoch": 0.8928034629952504, + "flos": 13297722117120.0, + "grad_norm": 2.3651278551637165, + "language_loss": 0.73694777, + "learning_rate": 1.192273216661206e-07, + "loss": 0.75882572, + "num_input_tokens_seen": 160557820, + "step": 7425, + "time_per_iteration": 2.726548671722412 + }, + { + "auxiliary_loss_clip": 0.01071988, + "auxiliary_loss_mlp": 0.01079102, + "balance_loss_clip": 1.01777625, + "balance_loss_mlp": 1.00013769, + "epoch": 0.8929237058858895, + "flos": 54854556744960.0, + "grad_norm": 0.7641284481057474, + "language_loss": 0.57549167, + "learning_rate": 1.189625297463881e-07, + "loss": 0.59700257, + "num_input_tokens_seen": 160619510, + "step": 7426, + "time_per_iteration": 3.4235517978668213 + }, + { + "auxiliary_loss_clip": 0.01082948, + "auxiliary_loss_mlp": 0.01083686, + "balance_loss_clip": 1.0219233, + "balance_loss_mlp": 1.00357771, + "epoch": 0.8930439487765286, + "flos": 28883850785280.0, + "grad_norm": 1.6073911773159644, + "language_loss": 0.7924422, + "learning_rate": 1.1869802317854394e-07, + "loss": 0.81410861, + "num_input_tokens_seen": 160643295, + "step": 7427, + "time_per_iteration": 3.1273117065429688 + }, + { + "auxiliary_loss_clip": 0.01095191, + "auxiliary_loss_mlp": 0.01083757, + "balance_loss_clip": 1.02147889, + "balance_loss_mlp": 1.00360096, + "epoch": 0.8931641916671677, + "flos": 22419283432320.0, + "grad_norm": 1.764769733588937, + "language_loss": 0.71978748, + "learning_rate": 1.1843380200271425e-07, + "loss": 0.74157697, + "num_input_tokens_seen": 160662495, + "step": 7428, + "time_per_iteration": 2.8275749683380127 + }, + { + "auxiliary_loss_clip": 0.01091575, + "auxiliary_loss_mlp": 0.01085215, + "balance_loss_clip": 1.02494681, + "balance_loss_mlp": 1.00510657, + "epoch": 0.8932844345578068, + "flos": 25843149786240.0, + "grad_norm": 1.6943859622958257, + "language_loss": 0.80239499, + "learning_rate": 1.181698662589805e-07, + "loss": 0.82416284, + "num_input_tokens_seen": 160682080, + "step": 7429, + "time_per_iteration": 2.7994704246520996 + }, + { + "auxiliary_loss_clip": 0.01124791, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_clip": 1.02427006, + "balance_loss_mlp": 1.00312376, + "epoch": 0.8934046774484459, + "flos": 22925803069440.0, + "grad_norm": 1.7046892547245602, + "language_loss": 0.76057565, + "learning_rate": 1.1790621598738249e-07, + "loss": 0.78265536, + "num_input_tokens_seen": 160700395, + "step": 7430, + "time_per_iteration": 2.7088217735290527 + }, + { + "auxiliary_loss_clip": 0.01136334, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_clip": 1.02750158, + "balance_loss_mlp": 1.00409627, + "epoch": 0.8935249203390849, + "flos": 24462097718400.0, + "grad_norm": 1.7190623641893663, + "language_loss": 0.74881917, + "learning_rate": 1.1764285122791461e-07, + "loss": 0.77102363, + "num_input_tokens_seen": 160721115, + "step": 7431, + "time_per_iteration": 2.6619997024536133 + }, + { + "auxiliary_loss_clip": 0.01125663, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_clip": 1.02482438, + "balance_loss_mlp": 1.00368381, + "epoch": 0.8936451632297241, + "flos": 15742735966080.0, + "grad_norm": 1.8850488415045688, + "language_loss": 0.77560842, + "learning_rate": 1.173797720205294e-07, + "loss": 0.79770249, + "num_input_tokens_seen": 160739150, + "step": 7432, + "time_per_iteration": 2.6699795722961426 + }, + { + "auxiliary_loss_clip": 0.01124875, + "auxiliary_loss_mlp": 0.01083298, + "balance_loss_clip": 1.02514577, + "balance_loss_mlp": 1.0030942, + "epoch": 0.8937654061203631, + "flos": 35115500396160.0, + "grad_norm": 2.1606519243341897, + "language_loss": 0.7146734, + "learning_rate": 1.1711697840513602e-07, + "loss": 0.73675507, + "num_input_tokens_seen": 160758585, + "step": 7433, + "time_per_iteration": 2.7524120807647705 + }, + { + "auxiliary_loss_clip": 0.01125813, + "auxiliary_loss_mlp": 0.01084364, + "balance_loss_clip": 1.02466035, + "balance_loss_mlp": 1.00425565, + "epoch": 0.8938856490110022, + "flos": 16107444708480.0, + "grad_norm": 1.9297705489961465, + "language_loss": 0.70736611, + "learning_rate": 1.1685447042160012e-07, + "loss": 0.72946787, + "num_input_tokens_seen": 160776620, + "step": 7434, + "time_per_iteration": 2.667555570602417 + }, + { + "auxiliary_loss_clip": 0.01136176, + "auxiliary_loss_mlp": 0.01084209, + "balance_loss_clip": 1.02653444, + "balance_loss_mlp": 1.00400507, + "epoch": 0.8940058919016414, + "flos": 20704189858560.0, + "grad_norm": 1.4802317845293431, + "language_loss": 0.7141844, + "learning_rate": 1.1659224810974367e-07, + "loss": 0.73638821, + "num_input_tokens_seen": 160796580, + "step": 7435, + "time_per_iteration": 2.6319727897644043 + }, + { + "auxiliary_loss_clip": 0.01114991, + "auxiliary_loss_mlp": 0.01084041, + "balance_loss_clip": 1.02413249, + "balance_loss_mlp": 1.00398004, + "epoch": 0.8941261347922804, + "flos": 25229041937280.0, + "grad_norm": 1.4784885473572547, + "language_loss": 0.68258089, + "learning_rate": 1.1633031150934591e-07, + "loss": 0.70457119, + "num_input_tokens_seen": 160819610, + "step": 7436, + "time_per_iteration": 2.8193185329437256 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.02512872, + "balance_loss_mlp": 1.00392711, + "epoch": 0.8942463776829195, + "flos": 19537236806400.0, + "grad_norm": 1.9609174749432443, + "language_loss": 0.80136406, + "learning_rate": 1.1606866066014176e-07, + "loss": 0.82345182, + "num_input_tokens_seen": 160838660, + "step": 7437, + "time_per_iteration": 3.51898193359375 + }, + { + "auxiliary_loss_clip": 0.01104896, + "auxiliary_loss_mlp": 0.01083685, + "balance_loss_clip": 1.02350187, + "balance_loss_mlp": 1.00371921, + "epoch": 0.8943666205735585, + "flos": 22301567585280.0, + "grad_norm": 2.039398574734639, + "language_loss": 0.7541995, + "learning_rate": 1.1580729560182434e-07, + "loss": 0.77608532, + "num_input_tokens_seen": 160854515, + "step": 7438, + "time_per_iteration": 2.6899468898773193 + }, + { + "auxiliary_loss_clip": 0.01134204, + "auxiliary_loss_mlp": 0.00872882, + "balance_loss_clip": 1.02523875, + "balance_loss_mlp": 1.00009823, + "epoch": 0.8944868634641977, + "flos": 18912893581440.0, + "grad_norm": 2.1941054697848683, + "language_loss": 0.7092551, + "learning_rate": 1.1554621637404171e-07, + "loss": 0.72932595, + "num_input_tokens_seen": 160872605, + "step": 7439, + "time_per_iteration": 3.5358965396881104 + }, + { + "auxiliary_loss_clip": 0.0112483, + "auxiliary_loss_mlp": 0.01082765, + "balance_loss_clip": 1.02378619, + "balance_loss_mlp": 1.00279939, + "epoch": 0.8946071063548368, + "flos": 14460904241280.0, + "grad_norm": 2.244591490724717, + "language_loss": 0.61239433, + "learning_rate": 1.1528542301639999e-07, + "loss": 0.63447022, + "num_input_tokens_seen": 160889395, + "step": 7440, + "time_per_iteration": 2.6189651489257812 + }, + { + "auxiliary_loss_clip": 0.01105034, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_clip": 1.02198577, + "balance_loss_mlp": 1.00416207, + "epoch": 0.8947273492454758, + "flos": 20084084438400.0, + "grad_norm": 2.2714062552967618, + "language_loss": 0.82660222, + "learning_rate": 1.1502491556846105e-07, + "loss": 0.84849626, + "num_input_tokens_seen": 160907890, + "step": 7441, + "time_per_iteration": 2.822366952896118 + }, + { + "auxiliary_loss_clip": 0.01114132, + "auxiliary_loss_mlp": 0.01082389, + "balance_loss_clip": 1.02342057, + "balance_loss_mlp": 1.00237608, + "epoch": 0.894847592136115, + "flos": 18550555136640.0, + "grad_norm": 3.586075596102705, + "language_loss": 0.81200993, + "learning_rate": 1.1476469406974331e-07, + "loss": 0.8339752, + "num_input_tokens_seen": 160923490, + "step": 7442, + "time_per_iteration": 2.796818494796753 + }, + { + "auxiliary_loss_clip": 0.01134935, + "auxiliary_loss_mlp": 0.01084202, + "balance_loss_clip": 1.0262301, + "balance_loss_mlp": 1.00418913, + "epoch": 0.894967835026754, + "flos": 23478468704640.0, + "grad_norm": 1.5507412886935046, + "language_loss": 0.76979327, + "learning_rate": 1.1450475855972341e-07, + "loss": 0.79198462, + "num_input_tokens_seen": 160944280, + "step": 7443, + "time_per_iteration": 4.511506795883179 + }, + { + "auxiliary_loss_clip": 0.0111438, + "auxiliary_loss_mlp": 0.00872905, + "balance_loss_clip": 1.0226357, + "balance_loss_mlp": 1.00008035, + "epoch": 0.8950880779173931, + "flos": 15188310564480.0, + "grad_norm": 1.928692016724518, + "language_loss": 0.70545816, + "learning_rate": 1.1424510907783158e-07, + "loss": 0.72533095, + "num_input_tokens_seen": 160961560, + "step": 7444, + "time_per_iteration": 2.6272971630096436 + }, + { + "auxiliary_loss_clip": 0.01117164, + "auxiliary_loss_mlp": 0.01083805, + "balance_loss_clip": 1.0237931, + "balance_loss_mlp": 1.00374365, + "epoch": 0.8952083208080323, + "flos": 22091957769600.0, + "grad_norm": 1.5597087512874523, + "language_loss": 0.82675219, + "learning_rate": 1.1398574566345787e-07, + "loss": 0.84876192, + "num_input_tokens_seen": 160982195, + "step": 7445, + "time_per_iteration": 2.707533836364746 + }, + { + "auxiliary_loss_clip": 0.01117817, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_clip": 1.02479374, + "balance_loss_mlp": 1.00388372, + "epoch": 0.8953285636986713, + "flos": 23254026572160.0, + "grad_norm": 1.9961727981271957, + "language_loss": 0.82125068, + "learning_rate": 1.1372666835594702e-07, + "loss": 0.84326971, + "num_input_tokens_seen": 161000520, + "step": 7446, + "time_per_iteration": 2.760331630706787 + }, + { + "auxiliary_loss_clip": 0.01112696, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_clip": 1.02270257, + "balance_loss_mlp": 1.00365162, + "epoch": 0.8954488065893104, + "flos": 16362661818240.0, + "grad_norm": 1.8889014619551696, + "language_loss": 0.71951205, + "learning_rate": 1.1346787719460071e-07, + "loss": 0.74147564, + "num_input_tokens_seen": 161019405, + "step": 7447, + "time_per_iteration": 2.819816827774048 + }, + { + "auxiliary_loss_clip": 0.01112611, + "auxiliary_loss_mlp": 0.01084332, + "balance_loss_clip": 1.0221833, + "balance_loss_mlp": 1.00427067, + "epoch": 0.8955690494799495, + "flos": 18257883120000.0, + "grad_norm": 1.6966486420260487, + "language_loss": 0.72372448, + "learning_rate": 1.1320937221867732e-07, + "loss": 0.74569392, + "num_input_tokens_seen": 161036985, + "step": 7448, + "time_per_iteration": 2.6822214126586914 + }, + { + "auxiliary_loss_clip": 0.01117145, + "auxiliary_loss_mlp": 0.01083318, + "balance_loss_clip": 1.02484012, + "balance_loss_mlp": 1.00330472, + "epoch": 0.8956892923705886, + "flos": 25447486498560.0, + "grad_norm": 1.69767128773196, + "language_loss": 0.79721004, + "learning_rate": 1.1295115346739192e-07, + "loss": 0.8192147, + "num_input_tokens_seen": 161056985, + "step": 7449, + "time_per_iteration": 2.797607421875 + }, + { + "auxiliary_loss_clip": 0.01113783, + "auxiliary_loss_mlp": 0.01084072, + "balance_loss_clip": 1.02251327, + "balance_loss_mlp": 1.0038681, + "epoch": 0.8958095352612276, + "flos": 52661883939840.0, + "grad_norm": 2.4968326760678843, + "language_loss": 0.73389953, + "learning_rate": 1.1269322097991629e-07, + "loss": 0.75587809, + "num_input_tokens_seen": 161080270, + "step": 7450, + "time_per_iteration": 3.0139591693878174 + }, + { + "auxiliary_loss_clip": 0.01125368, + "auxiliary_loss_mlp": 0.01085771, + "balance_loss_clip": 1.02492273, + "balance_loss_mlp": 1.00547206, + "epoch": 0.8959297781518668, + "flos": 23186335392000.0, + "grad_norm": 2.356191224985513, + "language_loss": 0.67728198, + "learning_rate": 1.1243557479537846e-07, + "loss": 0.69939339, + "num_input_tokens_seen": 161100160, + "step": 7451, + "time_per_iteration": 2.68221378326416 + }, + { + "auxiliary_loss_clip": 0.0113366, + "auxiliary_loss_mlp": 0.01084454, + "balance_loss_clip": 1.02484941, + "balance_loss_mlp": 1.00429749, + "epoch": 0.8960500210425059, + "flos": 20334309557760.0, + "grad_norm": 2.0013441627182242, + "language_loss": 0.68710679, + "learning_rate": 1.121782149528634e-07, + "loss": 0.70928794, + "num_input_tokens_seen": 161117260, + "step": 7452, + "time_per_iteration": 2.6986212730407715 + }, + { + "auxiliary_loss_clip": 0.01096258, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_clip": 1.0222683, + "balance_loss_mlp": 1.00366902, + "epoch": 0.8961702639331449, + "flos": 19901694153600.0, + "grad_norm": 1.861611340045278, + "language_loss": 0.7878322, + "learning_rate": 1.1192114149141208e-07, + "loss": 0.80963159, + "num_input_tokens_seen": 161136895, + "step": 7453, + "time_per_iteration": 2.7049875259399414 + }, + { + "auxiliary_loss_clip": 0.01116982, + "auxiliary_loss_mlp": 0.01084111, + "balance_loss_clip": 1.02438927, + "balance_loss_mlp": 1.00385964, + "epoch": 0.8962905068237841, + "flos": 12896348567040.0, + "grad_norm": 2.0034855728941934, + "language_loss": 0.65652728, + "learning_rate": 1.1166435445002197e-07, + "loss": 0.67853826, + "num_input_tokens_seen": 161154565, + "step": 7454, + "time_per_iteration": 2.711139678955078 + }, + { + "auxiliary_loss_clip": 0.01124909, + "auxiliary_loss_mlp": 0.01082751, + "balance_loss_clip": 1.02462077, + "balance_loss_mlp": 1.00269008, + "epoch": 0.8964107497144231, + "flos": 23440331439360.0, + "grad_norm": 1.9606255311986822, + "language_loss": 0.68621522, + "learning_rate": 1.1140785386764818e-07, + "loss": 0.70829183, + "num_input_tokens_seen": 161173265, + "step": 7455, + "time_per_iteration": 2.6603360176086426 + }, + { + "auxiliary_loss_clip": 0.0112599, + "auxiliary_loss_mlp": 0.01083854, + "balance_loss_clip": 1.02533472, + "balance_loss_mlp": 1.00374508, + "epoch": 0.8965309926050622, + "flos": 19500176949120.0, + "grad_norm": 1.897134189538457, + "language_loss": 0.69597936, + "learning_rate": 1.1115163978320153e-07, + "loss": 0.71807778, + "num_input_tokens_seen": 161191995, + "step": 7456, + "time_per_iteration": 2.688375949859619 + }, + { + "auxiliary_loss_clip": 0.01127599, + "auxiliary_loss_mlp": 0.00872928, + "balance_loss_clip": 1.02650523, + "balance_loss_mlp": 1.0000999, + "epoch": 0.8966512354957014, + "flos": 28658008022400.0, + "grad_norm": 2.0609443825902254, + "language_loss": 0.82391065, + "learning_rate": 1.1089571223554917e-07, + "loss": 0.84391588, + "num_input_tokens_seen": 161212880, + "step": 7457, + "time_per_iteration": 2.6711044311523438 + }, + { + "auxiliary_loss_clip": 0.01125816, + "auxiliary_loss_mlp": 0.01084438, + "balance_loss_clip": 1.02457428, + "balance_loss_mlp": 1.00437748, + "epoch": 0.8967714783863404, + "flos": 23370916406400.0, + "grad_norm": 1.7750732880645912, + "language_loss": 0.85409749, + "learning_rate": 1.1064007126351537e-07, + "loss": 0.87619996, + "num_input_tokens_seen": 161233595, + "step": 7458, + "time_per_iteration": 2.73180890083313 + }, + { + "auxiliary_loss_clip": 0.01113718, + "auxiliary_loss_mlp": 0.01084491, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.00438237, + "epoch": 0.8968917212769795, + "flos": 24535175938560.0, + "grad_norm": 4.042370443574951, + "language_loss": 0.76232624, + "learning_rate": 1.1038471690588003e-07, + "loss": 0.78430837, + "num_input_tokens_seen": 161252740, + "step": 7459, + "time_per_iteration": 2.7321271896362305 + }, + { + "auxiliary_loss_clip": 0.01091308, + "auxiliary_loss_mlp": 0.01084495, + "balance_loss_clip": 1.01904702, + "balance_loss_mlp": 1.00438643, + "epoch": 0.8970119641676186, + "flos": 23475416048640.0, + "grad_norm": 2.0733022634072342, + "language_loss": 0.79881191, + "learning_rate": 1.1012964920138145e-07, + "loss": 0.82056993, + "num_input_tokens_seen": 161272325, + "step": 7460, + "time_per_iteration": 2.7949087619781494 + }, + { + "auxiliary_loss_clip": 0.01117966, + "auxiliary_loss_mlp": 0.01084219, + "balance_loss_clip": 1.02463484, + "balance_loss_mlp": 1.0041585, + "epoch": 0.8971322070582577, + "flos": 24538192680960.0, + "grad_norm": 1.6010132450808008, + "language_loss": 0.75858033, + "learning_rate": 1.0987486818871205e-07, + "loss": 0.78060222, + "num_input_tokens_seen": 161295915, + "step": 7461, + "time_per_iteration": 2.8377737998962402 + }, + { + "auxiliary_loss_clip": 0.01124981, + "auxiliary_loss_mlp": 0.00872923, + "balance_loss_clip": 1.02496684, + "balance_loss_mlp": 1.00010669, + "epoch": 0.8972524499488967, + "flos": 21797454159360.0, + "grad_norm": 2.0776763277034465, + "language_loss": 0.72936594, + "learning_rate": 1.0962037390652245e-07, + "loss": 0.74934494, + "num_input_tokens_seen": 161314935, + "step": 7462, + "time_per_iteration": 2.6914618015289307 + }, + { + "auxiliary_loss_clip": 0.01111599, + "auxiliary_loss_mlp": 0.01084791, + "balance_loss_clip": 1.02105594, + "balance_loss_mlp": 1.00458741, + "epoch": 0.8973726928395359, + "flos": 21726243446400.0, + "grad_norm": 2.3075291750445213, + "language_loss": 0.72208524, + "learning_rate": 1.0936616639341911e-07, + "loss": 0.74404913, + "num_input_tokens_seen": 161335225, + "step": 7463, + "time_per_iteration": 3.587846517562866 + }, + { + "auxiliary_loss_clip": 0.01099576, + "auxiliary_loss_mlp": 0.01078906, + "balance_loss_clip": 1.02184212, + "balance_loss_mlp": 0.99994165, + "epoch": 0.897492935730175, + "flos": 53837100097920.0, + "grad_norm": 0.7305221571937272, + "language_loss": 0.54761803, + "learning_rate": 1.0911224568796473e-07, + "loss": 0.56940281, + "num_input_tokens_seen": 161393420, + "step": 7464, + "time_per_iteration": 4.194063663482666 + }, + { + "auxiliary_loss_clip": 0.01125096, + "auxiliary_loss_mlp": 0.01083803, + "balance_loss_clip": 1.02564692, + "balance_loss_mlp": 1.00369442, + "epoch": 0.897613178620814, + "flos": 18290346036480.0, + "grad_norm": 1.8391358713504182, + "language_loss": 0.70944387, + "learning_rate": 1.0885861182867984e-07, + "loss": 0.73153287, + "num_input_tokens_seen": 161411525, + "step": 7465, + "time_per_iteration": 2.6287195682525635 + }, + { + "auxiliary_loss_clip": 0.01116263, + "auxiliary_loss_mlp": 0.01084833, + "balance_loss_clip": 1.02431345, + "balance_loss_mlp": 1.00467682, + "epoch": 0.8977334215114532, + "flos": 32993718059520.0, + "grad_norm": 3.6794259480627667, + "language_loss": 0.70678777, + "learning_rate": 1.0860526485403942e-07, + "loss": 0.72879869, + "num_input_tokens_seen": 161432800, + "step": 7466, + "time_per_iteration": 2.826699733734131 + }, + { + "auxiliary_loss_clip": 0.01134709, + "auxiliary_loss_mlp": 0.01083778, + "balance_loss_clip": 1.02579248, + "balance_loss_mlp": 1.0037173, + "epoch": 0.8978536644020922, + "flos": 15195636938880.0, + "grad_norm": 1.6387197610113837, + "language_loss": 0.77185428, + "learning_rate": 1.0835220480247675e-07, + "loss": 0.79403913, + "num_input_tokens_seen": 161451295, + "step": 7467, + "time_per_iteration": 2.629603862762451 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.01083557, + "balance_loss_clip": 1.02390826, + "balance_loss_mlp": 1.00344872, + "epoch": 0.8979739072927313, + "flos": 18004389863040.0, + "grad_norm": 2.284250308638524, + "language_loss": 0.83603936, + "learning_rate": 1.0809943171238067e-07, + "loss": 0.85796273, + "num_input_tokens_seen": 161469220, + "step": 7468, + "time_per_iteration": 3.6697885990142822 + }, + { + "auxiliary_loss_clip": 0.01116752, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_clip": 1.02432656, + "balance_loss_mlp": 1.00476241, + "epoch": 0.8980941501833704, + "flos": 22271546793600.0, + "grad_norm": 23.675172643904233, + "language_loss": 0.62716597, + "learning_rate": 1.078469456220965e-07, + "loss": 0.64918363, + "num_input_tokens_seen": 161489375, + "step": 7469, + "time_per_iteration": 3.7926723957061768 + }, + { + "auxiliary_loss_clip": 0.01109882, + "auxiliary_loss_mlp": 0.01083476, + "balance_loss_clip": 1.02605581, + "balance_loss_mlp": 1.00341511, + "epoch": 0.8982143930740095, + "flos": 37560729726720.0, + "grad_norm": 1.8193609035666165, + "language_loss": 0.69662118, + "learning_rate": 1.0759474656992606e-07, + "loss": 0.71855468, + "num_input_tokens_seen": 161512145, + "step": 7470, + "time_per_iteration": 2.8477988243103027 + }, + { + "auxiliary_loss_clip": 0.01116958, + "auxiliary_loss_mlp": 0.01083214, + "balance_loss_clip": 1.02406192, + "balance_loss_mlp": 1.00310564, + "epoch": 0.8983346359646486, + "flos": 18076893465600.0, + "grad_norm": 2.6207747796239054, + "language_loss": 0.77353287, + "learning_rate": 1.0734283459412785e-07, + "loss": 0.79553461, + "num_input_tokens_seen": 161528995, + "step": 7471, + "time_per_iteration": 2.771484136581421 + }, + { + "auxiliary_loss_clip": 0.01097473, + "auxiliary_loss_mlp": 0.01083184, + "balance_loss_clip": 1.02365446, + "balance_loss_mlp": 1.00293255, + "epoch": 0.8984548788552876, + "flos": 20558895344640.0, + "grad_norm": 2.0505893649487175, + "language_loss": 0.80896074, + "learning_rate": 1.0709120973291707e-07, + "loss": 0.83076727, + "num_input_tokens_seen": 161548775, + "step": 7472, + "time_per_iteration": 2.879744052886963 + }, + { + "auxiliary_loss_clip": 0.01136061, + "auxiliary_loss_mlp": 0.01085425, + "balance_loss_clip": 1.02683902, + "balance_loss_mlp": 1.00517321, + "epoch": 0.8985751217459268, + "flos": 17785442511360.0, + "grad_norm": 1.9292971445675848, + "language_loss": 0.77863216, + "learning_rate": 1.0683987202446475e-07, + "loss": 0.80084693, + "num_input_tokens_seen": 161566960, + "step": 7473, + "time_per_iteration": 2.687079668045044 + }, + { + "auxiliary_loss_clip": 0.01127101, + "auxiliary_loss_mlp": 0.0108303, + "balance_loss_clip": 1.02583206, + "balance_loss_mlp": 1.00292122, + "epoch": 0.8986953646365659, + "flos": 21617003208960.0, + "grad_norm": 1.720635076821828, + "language_loss": 0.69756138, + "learning_rate": 1.0658882150689862e-07, + "loss": 0.71966267, + "num_input_tokens_seen": 161585820, + "step": 7474, + "time_per_iteration": 2.705261707305908 + }, + { + "auxiliary_loss_clip": 0.01106124, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_clip": 1.02336478, + "balance_loss_mlp": 1.00462985, + "epoch": 0.8988156075272049, + "flos": 14027355083520.0, + "grad_norm": 3.435758331857142, + "language_loss": 0.78548193, + "learning_rate": 1.0633805821830288e-07, + "loss": 0.807392, + "num_input_tokens_seen": 161602505, + "step": 7475, + "time_per_iteration": 2.7412755489349365 + }, + { + "auxiliary_loss_clip": 0.01117627, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_clip": 1.02537584, + "balance_loss_mlp": 1.00401783, + "epoch": 0.8989358504178441, + "flos": 29059202004480.0, + "grad_norm": 2.261838988070357, + "language_loss": 0.82991695, + "learning_rate": 1.0608758219671753e-07, + "loss": 0.85193539, + "num_input_tokens_seen": 161621545, + "step": 7476, + "time_per_iteration": 2.7943294048309326 + }, + { + "auxiliary_loss_clip": 0.01116959, + "auxiliary_loss_mlp": 0.01083711, + "balance_loss_clip": 1.0243721, + "balance_loss_mlp": 1.00364971, + "epoch": 0.8990560933084831, + "flos": 20230420446720.0, + "grad_norm": 1.573544033857551, + "language_loss": 0.70695937, + "learning_rate": 1.0583739348014065e-07, + "loss": 0.72896606, + "num_input_tokens_seen": 161642630, + "step": 7477, + "time_per_iteration": 2.708444118499756 + }, + { + "auxiliary_loss_clip": 0.01136774, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_clip": 1.02735353, + "balance_loss_mlp": 1.0036844, + "epoch": 0.8991763361991222, + "flos": 25520672459520.0, + "grad_norm": 2.0401864731903347, + "language_loss": 0.84681845, + "learning_rate": 1.0558749210652518e-07, + "loss": 0.86902356, + "num_input_tokens_seen": 161662560, + "step": 7478, + "time_per_iteration": 2.700590133666992 + }, + { + "auxiliary_loss_clip": 0.01106058, + "auxiliary_loss_mlp": 0.01083422, + "balance_loss_clip": 1.02290499, + "balance_loss_mlp": 1.00331318, + "epoch": 0.8992965790897613, + "flos": 25119191168640.0, + "grad_norm": 1.6777442609977113, + "language_loss": 0.85553646, + "learning_rate": 1.053378781137808e-07, + "loss": 0.87743127, + "num_input_tokens_seen": 161683480, + "step": 7479, + "time_per_iteration": 2.7217087745666504 + }, + { + "auxiliary_loss_clip": 0.01099443, + "auxiliary_loss_mlp": 0.01084932, + "balance_loss_clip": 1.0239675, + "balance_loss_mlp": 1.00477529, + "epoch": 0.8994168219804004, + "flos": 16070815814400.0, + "grad_norm": 1.794257495031174, + "language_loss": 0.77596986, + "learning_rate": 1.0508855153977392e-07, + "loss": 0.79781359, + "num_input_tokens_seen": 161699945, + "step": 7480, + "time_per_iteration": 2.692199230194092 + }, + { + "auxiliary_loss_clip": 0.0112547, + "auxiliary_loss_mlp": 0.01083258, + "balance_loss_clip": 1.02468145, + "balance_loss_mlp": 1.00319755, + "epoch": 0.8995370648710395, + "flos": 24825764966400.0, + "grad_norm": 2.1703749117216047, + "language_loss": 0.66985798, + "learning_rate": 1.0483951242232669e-07, + "loss": 0.69194531, + "num_input_tokens_seen": 161720420, + "step": 7481, + "time_per_iteration": 2.604203939437866 + }, + { + "auxiliary_loss_clip": 0.01112393, + "auxiliary_loss_mlp": 0.01078816, + "balance_loss_clip": 1.01727986, + "balance_loss_mlp": 0.99985152, + "epoch": 0.8996573077616786, + "flos": 63116238378240.0, + "grad_norm": 0.9677756453196601, + "language_loss": 0.57732475, + "learning_rate": 1.0459076079921936e-07, + "loss": 0.59923685, + "num_input_tokens_seen": 161773080, + "step": 7482, + "time_per_iteration": 3.2290165424346924 + }, + { + "auxiliary_loss_clip": 0.01117792, + "auxiliary_loss_mlp": 0.01084738, + "balance_loss_clip": 1.02604544, + "balance_loss_mlp": 1.00467741, + "epoch": 0.8997775506523177, + "flos": 18219674027520.0, + "grad_norm": 2.4290195151260576, + "language_loss": 0.85158277, + "learning_rate": 1.0434229670818618e-07, + "loss": 0.87360811, + "num_input_tokens_seen": 161789755, + "step": 7483, + "time_per_iteration": 2.5591249465942383 + }, + { + "auxiliary_loss_clip": 0.01116548, + "auxiliary_loss_mlp": 0.01083766, + "balance_loss_clip": 1.02446294, + "balance_loss_mlp": 1.00365734, + "epoch": 0.8998977935429567, + "flos": 24166768095360.0, + "grad_norm": 1.3681649622302072, + "language_loss": 0.79928958, + "learning_rate": 1.0409412018691944e-07, + "loss": 0.82129264, + "num_input_tokens_seen": 161810220, + "step": 7484, + "time_per_iteration": 2.6090571880340576 + }, + { + "auxiliary_loss_clip": 0.01118212, + "auxiliary_loss_mlp": 0.01085093, + "balance_loss_clip": 1.02603567, + "balance_loss_mlp": 1.0049367, + "epoch": 0.9000180364335959, + "flos": 20773030273920.0, + "grad_norm": 1.8211566409599853, + "language_loss": 0.74993002, + "learning_rate": 1.0384623127306724e-07, + "loss": 0.77196312, + "num_input_tokens_seen": 161827565, + "step": 7485, + "time_per_iteration": 2.692556381225586 + }, + { + "auxiliary_loss_clip": 0.01109086, + "auxiliary_loss_mlp": 0.01084612, + "balance_loss_clip": 1.0246613, + "balance_loss_mlp": 1.00450349, + "epoch": 0.900138279324235, + "flos": 19205745166080.0, + "grad_norm": 1.7317372286528707, + "language_loss": 0.79436338, + "learning_rate": 1.0359863000423397e-07, + "loss": 0.81630033, + "num_input_tokens_seen": 161845700, + "step": 7486, + "time_per_iteration": 2.7119758129119873 + }, + { + "auxiliary_loss_clip": 0.01134748, + "auxiliary_loss_mlp": 0.01084204, + "balance_loss_clip": 1.02502847, + "balance_loss_mlp": 1.00404763, + "epoch": 0.900258522214874, + "flos": 28731158069760.0, + "grad_norm": 1.6417969629441038, + "language_loss": 0.72257113, + "learning_rate": 1.0335131641798112e-07, + "loss": 0.74476063, + "num_input_tokens_seen": 161867660, + "step": 7487, + "time_per_iteration": 2.6650445461273193 + }, + { + "auxiliary_loss_clip": 0.01095165, + "auxiliary_loss_mlp": 0.01078993, + "balance_loss_clip": 1.01706648, + "balance_loss_mlp": 1.00002885, + "epoch": 0.9003787651055132, + "flos": 58280685655680.0, + "grad_norm": 0.8049812632343336, + "language_loss": 0.55694985, + "learning_rate": 1.0310429055182512e-07, + "loss": 0.57869136, + "num_input_tokens_seen": 161921980, + "step": 7488, + "time_per_iteration": 4.000751495361328 + }, + { + "auxiliary_loss_clip": 0.0110661, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_clip": 1.02339005, + "balance_loss_mlp": 1.00471163, + "epoch": 0.9004990079961522, + "flos": 25556475340800.0, + "grad_norm": 1.671991501738669, + "language_loss": 0.74328423, + "learning_rate": 1.0285755244324024e-07, + "loss": 0.76519948, + "num_input_tokens_seen": 161942725, + "step": 7489, + "time_per_iteration": 2.8222787380218506 + }, + { + "auxiliary_loss_clip": 0.01117088, + "auxiliary_loss_mlp": 0.00872808, + "balance_loss_clip": 1.0241729, + "balance_loss_mlp": 1.00007057, + "epoch": 0.9006192508867913, + "flos": 23335185352320.0, + "grad_norm": 1.4230363814343001, + "language_loss": 0.68380153, + "learning_rate": 1.0261110212965629e-07, + "loss": 0.70370048, + "num_input_tokens_seen": 161964520, + "step": 7490, + "time_per_iteration": 3.880011796951294 + }, + { + "auxiliary_loss_clip": 0.01117712, + "auxiliary_loss_mlp": 0.01083746, + "balance_loss_clip": 1.02521491, + "balance_loss_mlp": 1.003685, + "epoch": 0.9007394937774305, + "flos": 18040300485120.0, + "grad_norm": 2.1211842871848607, + "language_loss": 0.79114854, + "learning_rate": 1.023649396484596e-07, + "loss": 0.81316316, + "num_input_tokens_seen": 161983575, + "step": 7491, + "time_per_iteration": 2.6635429859161377 + }, + { + "auxiliary_loss_clip": 0.01134434, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_clip": 1.02528024, + "balance_loss_mlp": 1.00400603, + "epoch": 0.9008597366680695, + "flos": 43068456633600.0, + "grad_norm": 2.070767621623002, + "language_loss": 0.67765844, + "learning_rate": 1.0211906503699275e-07, + "loss": 0.69984394, + "num_input_tokens_seen": 162006550, + "step": 7492, + "time_per_iteration": 2.8495354652404785 + }, + { + "auxiliary_loss_clip": 0.0112582, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_clip": 1.02581072, + "balance_loss_mlp": 1.00463128, + "epoch": 0.9009799795587086, + "flos": 14939055112320.0, + "grad_norm": 2.2103754738932726, + "language_loss": 0.82294583, + "learning_rate": 1.0187347833255455e-07, + "loss": 0.84505147, + "num_input_tokens_seen": 162022455, + "step": 7493, + "time_per_iteration": 3.5559511184692383 + }, + { + "auxiliary_loss_clip": 0.01134364, + "auxiliary_loss_mlp": 0.01083232, + "balance_loss_clip": 1.02578044, + "balance_loss_mlp": 1.00307548, + "epoch": 0.9011002224493477, + "flos": 21579584215680.0, + "grad_norm": 1.5835530623247813, + "language_loss": 0.79081428, + "learning_rate": 1.0162817957240056e-07, + "loss": 0.81299025, + "num_input_tokens_seen": 162042350, + "step": 7494, + "time_per_iteration": 3.532381772994995 + }, + { + "auxiliary_loss_clip": 0.01104339, + "auxiliary_loss_mlp": 0.01078922, + "balance_loss_clip": 1.01730716, + "balance_loss_mlp": 0.99995768, + "epoch": 0.9012204653399868, + "flos": 71166367883520.0, + "grad_norm": 0.8842084219106815, + "language_loss": 0.63094449, + "learning_rate": 1.0138316879374253e-07, + "loss": 0.65277714, + "num_input_tokens_seen": 162111640, + "step": 7495, + "time_per_iteration": 3.4321515560150146 + }, + { + "auxiliary_loss_clip": 0.01115436, + "auxiliary_loss_mlp": 0.01085447, + "balance_loss_clip": 1.02438605, + "balance_loss_mlp": 1.0052911, + "epoch": 0.9013407082306258, + "flos": 15594963413760.0, + "grad_norm": 2.2684387607039382, + "language_loss": 0.74115586, + "learning_rate": 1.0113844603374833e-07, + "loss": 0.7631647, + "num_input_tokens_seen": 162128165, + "step": 7496, + "time_per_iteration": 2.73213529586792 + }, + { + "auxiliary_loss_clip": 0.01116807, + "auxiliary_loss_mlp": 0.01083922, + "balance_loss_clip": 1.02410269, + "balance_loss_mlp": 1.00381327, + "epoch": 0.901460951121265, + "flos": 15049157276160.0, + "grad_norm": 2.809841370193149, + "language_loss": 0.72030461, + "learning_rate": 1.0089401132954178e-07, + "loss": 0.74231184, + "num_input_tokens_seen": 162146145, + "step": 7497, + "time_per_iteration": 2.7061920166015625 + }, + { + "auxiliary_loss_clip": 0.01116835, + "auxiliary_loss_mlp": 0.0108395, + "balance_loss_clip": 1.02488065, + "balance_loss_mlp": 1.00379348, + "epoch": 0.9015811940119041, + "flos": 22236857233920.0, + "grad_norm": 2.4469532133672085, + "language_loss": 0.72441244, + "learning_rate": 1.006498647182037e-07, + "loss": 0.74642026, + "num_input_tokens_seen": 162164800, + "step": 7498, + "time_per_iteration": 2.7030417919158936 + }, + { + "auxiliary_loss_clip": 0.01091403, + "auxiliary_loss_mlp": 0.0108445, + "balance_loss_clip": 1.02352357, + "balance_loss_mlp": 1.00434101, + "epoch": 0.9017014369025431, + "flos": 24973824827520.0, + "grad_norm": 2.112798581097609, + "language_loss": 0.7177158, + "learning_rate": 1.004060062367713e-07, + "loss": 0.73947436, + "num_input_tokens_seen": 162185895, + "step": 7499, + "time_per_iteration": 2.852461338043213 + }, + { + "auxiliary_loss_clip": 0.01124091, + "auxiliary_loss_mlp": 0.01084368, + "balance_loss_clip": 1.02371001, + "balance_loss_mlp": 1.00416374, + "epoch": 0.9018216797931822, + "flos": 18114168804480.0, + "grad_norm": 1.6503383647046126, + "language_loss": 0.69392914, + "learning_rate": 1.0016243592223728e-07, + "loss": 0.71601373, + "num_input_tokens_seen": 162206295, + "step": 7500, + "time_per_iteration": 2.7038800716400146 + }, + { + "auxiliary_loss_clip": 0.01086987, + "auxiliary_loss_mlp": 0.01083816, + "balance_loss_clip": 1.02200496, + "balance_loss_mlp": 1.00375557, + "epoch": 0.9019419226838213, + "flos": 37268452759680.0, + "grad_norm": 2.64389792898551, + "language_loss": 0.65873063, + "learning_rate": 9.991915381155114e-08, + "loss": 0.6804387, + "num_input_tokens_seen": 162229275, + "step": 7501, + "time_per_iteration": 2.9603559970855713 + }, + { + "auxiliary_loss_clip": 0.01128192, + "auxiliary_loss_mlp": 0.01084317, + "balance_loss_clip": 1.02642834, + "balance_loss_mlp": 1.0042088, + "epoch": 0.9020621655744604, + "flos": 23441121538560.0, + "grad_norm": 2.043250483542922, + "language_loss": 0.74874264, + "learning_rate": 9.967615994161871e-08, + "loss": 0.77086771, + "num_input_tokens_seen": 162248935, + "step": 7502, + "time_per_iteration": 2.70161509513855 + }, + { + "auxiliary_loss_clip": 0.01135474, + "auxiliary_loss_mlp": 0.01083083, + "balance_loss_clip": 1.02617455, + "balance_loss_mlp": 1.0030694, + "epoch": 0.9021824084650995, + "flos": 22857465444480.0, + "grad_norm": 1.9031655736927946, + "language_loss": 0.78350151, + "learning_rate": 9.943345434930161e-08, + "loss": 0.80568707, + "num_input_tokens_seen": 162269185, + "step": 7503, + "time_per_iteration": 2.6669328212738037 + }, + { + "auxiliary_loss_clip": 0.0110528, + "auxiliary_loss_mlp": 0.01083952, + "balance_loss_clip": 1.02357638, + "balance_loss_mlp": 1.00389051, + "epoch": 0.9023026513557386, + "flos": 22127581082880.0, + "grad_norm": 1.977189102513128, + "language_loss": 0.68880022, + "learning_rate": 9.919103707141885e-08, + "loss": 0.71069258, + "num_input_tokens_seen": 162288065, + "step": 7504, + "time_per_iteration": 2.8068289756774902 + }, + { + "auxiliary_loss_clip": 0.01119792, + "auxiliary_loss_mlp": 0.01084132, + "balance_loss_clip": 1.02536321, + "balance_loss_mlp": 1.0038805, + "epoch": 0.9024228942463777, + "flos": 24199087357440.0, + "grad_norm": 1.758212008238461, + "language_loss": 0.7670275, + "learning_rate": 9.89489081447441e-08, + "loss": 0.78906673, + "num_input_tokens_seen": 162305265, + "step": 7505, + "time_per_iteration": 2.6566436290740967 + }, + { + "auxiliary_loss_clip": 0.01116929, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_clip": 1.02428889, + "balance_loss_mlp": 1.00432301, + "epoch": 0.9025431371370167, + "flos": 25008262992000.0, + "grad_norm": 4.136701419042972, + "language_loss": 0.82717419, + "learning_rate": 9.870706760600844e-08, + "loss": 0.84918725, + "num_input_tokens_seen": 162325215, + "step": 7506, + "time_per_iteration": 2.8262195587158203 + }, + { + "auxiliary_loss_clip": 0.01080246, + "auxiliary_loss_mlp": 0.01084308, + "balance_loss_clip": 1.02441478, + "balance_loss_mlp": 1.00415194, + "epoch": 0.9026633800276559, + "flos": 18952862440320.0, + "grad_norm": 1.9156754150679518, + "language_loss": 0.72811234, + "learning_rate": 9.846551549189918e-08, + "loss": 0.74975789, + "num_input_tokens_seen": 162344820, + "step": 7507, + "time_per_iteration": 2.777294874191284 + }, + { + "auxiliary_loss_clip": 0.01113687, + "auxiliary_loss_mlp": 0.01083239, + "balance_loss_clip": 1.02313423, + "balance_loss_mlp": 1.00313079, + "epoch": 0.902783622918295, + "flos": 32416059536640.0, + "grad_norm": 2.3629220715424477, + "language_loss": 0.686019, + "learning_rate": 9.822425183905902e-08, + "loss": 0.70798826, + "num_input_tokens_seen": 162365345, + "step": 7508, + "time_per_iteration": 2.840862274169922 + }, + { + "auxiliary_loss_clip": 0.01087206, + "auxiliary_loss_mlp": 0.01079359, + "balance_loss_clip": 1.01718616, + "balance_loss_mlp": 1.00039434, + "epoch": 0.902903865808934, + "flos": 63717453244800.0, + "grad_norm": 0.924995323829806, + "language_loss": 0.75178266, + "learning_rate": 9.798327668408823e-08, + "loss": 0.77344829, + "num_input_tokens_seen": 162426980, + "step": 7509, + "time_per_iteration": 3.416647434234619 + }, + { + "auxiliary_loss_clip": 0.01135485, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_clip": 1.02570391, + "balance_loss_mlp": 1.00423491, + "epoch": 0.9030241086995732, + "flos": 23804034600960.0, + "grad_norm": 1.854531221330711, + "language_loss": 0.6863184, + "learning_rate": 9.774259006354158e-08, + "loss": 0.70851719, + "num_input_tokens_seen": 162447050, + "step": 7510, + "time_per_iteration": 2.6103179454803467 + }, + { + "auxiliary_loss_clip": 0.0111844, + "auxiliary_loss_mlp": 0.01082918, + "balance_loss_clip": 1.0256691, + "balance_loss_mlp": 1.00295281, + "epoch": 0.9031443515902122, + "flos": 26395887248640.0, + "grad_norm": 7.854305161586254, + "language_loss": 0.76282734, + "learning_rate": 9.750219201393184e-08, + "loss": 0.78484088, + "num_input_tokens_seen": 162467015, + "step": 7511, + "time_per_iteration": 2.821716547012329 + }, + { + "auxiliary_loss_clip": 0.01123444, + "auxiliary_loss_mlp": 0.01083928, + "balance_loss_clip": 1.02363944, + "balance_loss_mlp": 1.00381947, + "epoch": 0.9032645944808513, + "flos": 24939350749440.0, + "grad_norm": 2.1196369378183544, + "language_loss": 0.77531874, + "learning_rate": 9.726208257172697e-08, + "loss": 0.79739249, + "num_input_tokens_seen": 162488710, + "step": 7512, + "time_per_iteration": 2.7069091796875 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.01083065, + "balance_loss_clip": 1.0261426, + "balance_loss_mlp": 1.00286138, + "epoch": 0.9033848373714904, + "flos": 21178821196800.0, + "grad_norm": 1.928881560108466, + "language_loss": 0.74670511, + "learning_rate": 9.702226177335115e-08, + "loss": 0.76888627, + "num_input_tokens_seen": 162507205, + "step": 7513, + "time_per_iteration": 2.6946358680725098 + }, + { + "auxiliary_loss_clip": 0.01108916, + "auxiliary_loss_mlp": 0.01084305, + "balance_loss_clip": 1.02342093, + "balance_loss_mlp": 1.00414836, + "epoch": 0.9035050802621295, + "flos": 26286359702400.0, + "grad_norm": 1.7274971370453354, + "language_loss": 0.72298437, + "learning_rate": 9.67827296551853e-08, + "loss": 0.74491656, + "num_input_tokens_seen": 162528490, + "step": 7514, + "time_per_iteration": 3.620626449584961 + }, + { + "auxiliary_loss_clip": 0.01119299, + "auxiliary_loss_mlp": 0.00872855, + "balance_loss_clip": 1.02613413, + "balance_loss_mlp": 1.00010669, + "epoch": 0.9036253231527686, + "flos": 24204546224640.0, + "grad_norm": 1.8810853724979655, + "language_loss": 0.68403071, + "learning_rate": 9.65434862535659e-08, + "loss": 0.70395231, + "num_input_tokens_seen": 162547860, + "step": 7515, + "time_per_iteration": 3.7053403854370117 + }, + { + "auxiliary_loss_clip": 0.0111472, + "auxiliary_loss_mlp": 0.01083701, + "balance_loss_clip": 1.02250457, + "balance_loss_mlp": 1.00359273, + "epoch": 0.9037455660434077, + "flos": 18072655660800.0, + "grad_norm": 5.653142536155468, + "language_loss": 0.65233833, + "learning_rate": 9.630453160478635e-08, + "loss": 0.67432255, + "num_input_tokens_seen": 162563215, + "step": 7516, + "time_per_iteration": 2.7127132415771484 + }, + { + "auxiliary_loss_clip": 0.01097875, + "auxiliary_loss_mlp": 0.0108405, + "balance_loss_clip": 1.02278924, + "balance_loss_mlp": 1.00403619, + "epoch": 0.9038658089340468, + "flos": 24060795995520.0, + "grad_norm": 1.5740991716123343, + "language_loss": 0.82492423, + "learning_rate": 9.60658657450959e-08, + "loss": 0.84674346, + "num_input_tokens_seen": 162583515, + "step": 7517, + "time_per_iteration": 2.945405960083008 + }, + { + "auxiliary_loss_clip": 0.01107852, + "auxiliary_loss_mlp": 0.01083397, + "balance_loss_clip": 1.02207971, + "balance_loss_mlp": 1.00338411, + "epoch": 0.9039860518246858, + "flos": 21834298535040.0, + "grad_norm": 1.6967250391479525, + "language_loss": 0.79615831, + "learning_rate": 9.582748871069979e-08, + "loss": 0.81807089, + "num_input_tokens_seen": 162602955, + "step": 7518, + "time_per_iteration": 2.719327449798584 + }, + { + "auxiliary_loss_clip": 0.01116183, + "auxiliary_loss_mlp": 0.00872885, + "balance_loss_clip": 1.02357531, + "balance_loss_mlp": 1.00007391, + "epoch": 0.904106294715325, + "flos": 26614870513920.0, + "grad_norm": 3.027404614551698, + "language_loss": 0.83435893, + "learning_rate": 9.558940053775954e-08, + "loss": 0.8542496, + "num_input_tokens_seen": 162621595, + "step": 7519, + "time_per_iteration": 4.544687032699585 + }, + { + "auxiliary_loss_clip": 0.01125442, + "auxiliary_loss_mlp": 0.01084517, + "balance_loss_clip": 1.02570796, + "balance_loss_mlp": 1.0044086, + "epoch": 0.904226537605964, + "flos": 17785693906560.0, + "grad_norm": 1.7267181801847604, + "language_loss": 0.67905146, + "learning_rate": 9.535160126239294e-08, + "loss": 0.70115107, + "num_input_tokens_seen": 162638220, + "step": 7520, + "time_per_iteration": 2.600268602371216 + }, + { + "auxiliary_loss_clip": 0.0112378, + "auxiliary_loss_mlp": 0.01083544, + "balance_loss_clip": 1.02389765, + "balance_loss_mlp": 1.0034833, + "epoch": 0.9043467804966031, + "flos": 24790428961920.0, + "grad_norm": 1.497602738903209, + "language_loss": 0.70491207, + "learning_rate": 9.511409092067424e-08, + "loss": 0.72698534, + "num_input_tokens_seen": 162658575, + "step": 7521, + "time_per_iteration": 2.6996195316314697 + }, + { + "auxiliary_loss_clip": 0.01112982, + "auxiliary_loss_mlp": 0.0108373, + "balance_loss_clip": 1.02284002, + "balance_loss_mlp": 1.00357389, + "epoch": 0.9044670233872423, + "flos": 22632125472000.0, + "grad_norm": 1.7097538738556233, + "language_loss": 0.6746586, + "learning_rate": 9.487686954863327e-08, + "loss": 0.69662571, + "num_input_tokens_seen": 162678295, + "step": 7522, + "time_per_iteration": 2.72812557220459 + }, + { + "auxiliary_loss_clip": 0.01124706, + "auxiliary_loss_mlp": 0.01084057, + "balance_loss_clip": 1.0249176, + "balance_loss_mlp": 1.00394797, + "epoch": 0.9045872662778813, + "flos": 23771320289280.0, + "grad_norm": 1.9311095833318697, + "language_loss": 0.77295494, + "learning_rate": 9.46399371822566e-08, + "loss": 0.79504257, + "num_input_tokens_seen": 162698070, + "step": 7523, + "time_per_iteration": 2.694420337677002 + }, + { + "auxiliary_loss_clip": 0.01135279, + "auxiliary_loss_mlp": 0.0108317, + "balance_loss_clip": 1.02610159, + "balance_loss_mlp": 1.0031091, + "epoch": 0.9047075091685204, + "flos": 15191039998080.0, + "grad_norm": 1.8209181769909886, + "language_loss": 0.7214815, + "learning_rate": 9.440329385748657e-08, + "loss": 0.74366599, + "num_input_tokens_seen": 162715140, + "step": 7524, + "time_per_iteration": 2.6640820503234863 + }, + { + "auxiliary_loss_clip": 0.01108222, + "auxiliary_loss_mlp": 0.01083379, + "balance_loss_clip": 1.02534509, + "balance_loss_mlp": 1.00341344, + "epoch": 0.9048277520591596, + "flos": 18003707504640.0, + "grad_norm": 1.6820048859125807, + "language_loss": 0.70539701, + "learning_rate": 9.416693961022137e-08, + "loss": 0.72731304, + "num_input_tokens_seen": 162733390, + "step": 7525, + "time_per_iteration": 2.7149574756622314 + }, + { + "auxiliary_loss_clip": 0.01087544, + "auxiliary_loss_mlp": 0.01084586, + "balance_loss_clip": 1.02186203, + "balance_loss_mlp": 1.00452518, + "epoch": 0.9049479949497986, + "flos": 21872471713920.0, + "grad_norm": 1.6957096003200427, + "language_loss": 0.77181494, + "learning_rate": 9.393087447631654e-08, + "loss": 0.79353619, + "num_input_tokens_seen": 162751670, + "step": 7526, + "time_per_iteration": 2.8515241146087646 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.0108388, + "balance_loss_clip": 1.02412474, + "balance_loss_mlp": 1.00391471, + "epoch": 0.9050682378404377, + "flos": 20773928113920.0, + "grad_norm": 1.6352749206066433, + "language_loss": 0.72696447, + "learning_rate": 9.36950984915823e-08, + "loss": 0.74879736, + "num_input_tokens_seen": 162770025, + "step": 7527, + "time_per_iteration": 2.708493232727051 + }, + { + "auxiliary_loss_clip": 0.01135391, + "auxiliary_loss_mlp": 0.01084587, + "balance_loss_clip": 1.02626622, + "balance_loss_mlp": 1.00438285, + "epoch": 0.9051884807310768, + "flos": 21580015178880.0, + "grad_norm": 1.5757313885505861, + "language_loss": 0.69130069, + "learning_rate": 9.345961169178607e-08, + "loss": 0.7135005, + "num_input_tokens_seen": 162789710, + "step": 7528, + "time_per_iteration": 2.612210273742676 + }, + { + "auxiliary_loss_clip": 0.01097111, + "auxiliary_loss_mlp": 0.01084069, + "balance_loss_clip": 1.02171421, + "balance_loss_mlp": 1.00391245, + "epoch": 0.9053087236217159, + "flos": 21908059113600.0, + "grad_norm": 1.4857415797291196, + "language_loss": 0.72955441, + "learning_rate": 9.322441411265081e-08, + "loss": 0.75136626, + "num_input_tokens_seen": 162810695, + "step": 7529, + "time_per_iteration": 2.7162444591522217 + }, + { + "auxiliary_loss_clip": 0.01116556, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_clip": 1.02462506, + "balance_loss_mlp": 1.00435221, + "epoch": 0.9054289665123549, + "flos": 17055809544960.0, + "grad_norm": 2.3753195495506536, + "language_loss": 0.73669189, + "learning_rate": 9.298950578985554e-08, + "loss": 0.75870252, + "num_input_tokens_seen": 162827770, + "step": 7530, + "time_per_iteration": 2.699532985687256 + }, + { + "auxiliary_loss_clip": 0.01117971, + "auxiliary_loss_mlp": 0.0087297, + "balance_loss_clip": 1.02455187, + "balance_loss_mlp": 1.00006342, + "epoch": 0.905549209402994, + "flos": 20777268078720.0, + "grad_norm": 1.6689122991200491, + "language_loss": 0.7092945, + "learning_rate": 9.275488675903665e-08, + "loss": 0.72920394, + "num_input_tokens_seen": 162846715, + "step": 7531, + "time_per_iteration": 2.683173418045044 + }, + { + "auxiliary_loss_clip": 0.01091501, + "auxiliary_loss_mlp": 0.01083388, + "balance_loss_clip": 1.01960611, + "balance_loss_mlp": 1.00318396, + "epoch": 0.9056694522936332, + "flos": 21686813291520.0, + "grad_norm": 1.894501802911085, + "language_loss": 0.73585188, + "learning_rate": 9.252055705578454e-08, + "loss": 0.75760078, + "num_input_tokens_seen": 162866215, + "step": 7532, + "time_per_iteration": 2.8435494899749756 + }, + { + "auxiliary_loss_clip": 0.01125298, + "auxiliary_loss_mlp": 0.01083823, + "balance_loss_clip": 1.02447164, + "balance_loss_mlp": 1.00380921, + "epoch": 0.9057896951842722, + "flos": 29569133433600.0, + "grad_norm": 1.6070396366820225, + "language_loss": 0.72130871, + "learning_rate": 9.228651671564747e-08, + "loss": 0.74339986, + "num_input_tokens_seen": 162888245, + "step": 7533, + "time_per_iteration": 2.7671566009521484 + }, + { + "auxiliary_loss_clip": 0.01090144, + "auxiliary_loss_mlp": 0.01085281, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.0052197, + "epoch": 0.9059099380749113, + "flos": 27892248952320.0, + "grad_norm": 1.493424834621116, + "language_loss": 0.78007972, + "learning_rate": 9.205276577412901e-08, + "loss": 0.80183393, + "num_input_tokens_seen": 162911025, + "step": 7534, + "time_per_iteration": 2.860386848449707 + }, + { + "auxiliary_loss_clip": 0.01118062, + "auxiliary_loss_mlp": 0.00872909, + "balance_loss_clip": 1.02498865, + "balance_loss_mlp": 1.00005269, + "epoch": 0.9060301809655504, + "flos": 17748993185280.0, + "grad_norm": 2.210465966769239, + "language_loss": 0.7724241, + "learning_rate": 9.181930426668905e-08, + "loss": 0.79233384, + "num_input_tokens_seen": 162927820, + "step": 7535, + "time_per_iteration": 2.8352224826812744 + }, + { + "auxiliary_loss_clip": 0.01079552, + "auxiliary_loss_mlp": 0.01083626, + "balance_loss_clip": 1.0223825, + "balance_loss_mlp": 1.0035646, + "epoch": 0.9061504238561895, + "flos": 31759432963200.0, + "grad_norm": 2.1257048065868847, + "language_loss": 0.67461723, + "learning_rate": 9.158613222874346e-08, + "loss": 0.69624907, + "num_input_tokens_seen": 162949445, + "step": 7536, + "time_per_iteration": 2.8584344387054443 + }, + { + "auxiliary_loss_clip": 0.01115055, + "auxiliary_loss_mlp": 0.0108324, + "balance_loss_clip": 1.02350152, + "balance_loss_mlp": 1.003227, + "epoch": 0.9062706667468285, + "flos": 20048066075520.0, + "grad_norm": 1.5893419679607177, + "language_loss": 0.82089543, + "learning_rate": 9.135324969566394e-08, + "loss": 0.8428784, + "num_input_tokens_seen": 162968945, + "step": 7537, + "time_per_iteration": 2.79543399810791 + }, + { + "auxiliary_loss_clip": 0.01126302, + "auxiliary_loss_mlp": 0.01083996, + "balance_loss_clip": 1.02522087, + "balance_loss_mlp": 1.0039351, + "epoch": 0.9063909096374677, + "flos": 18437292576000.0, + "grad_norm": 1.775267061603499, + "language_loss": 0.75583398, + "learning_rate": 9.112065670277913e-08, + "loss": 0.77793694, + "num_input_tokens_seen": 162985310, + "step": 7538, + "time_per_iteration": 2.6732096672058105 + }, + { + "auxiliary_loss_clip": 0.01116381, + "auxiliary_loss_mlp": 0.01084287, + "balance_loss_clip": 1.02361512, + "balance_loss_mlp": 1.0041784, + "epoch": 0.9065111525281068, + "flos": 33547353361920.0, + "grad_norm": 1.7746384945935796, + "language_loss": 0.73063564, + "learning_rate": 9.088835328537303e-08, + "loss": 0.75264227, + "num_input_tokens_seen": 163006900, + "step": 7539, + "time_per_iteration": 2.8912253379821777 + }, + { + "auxiliary_loss_clip": 0.01118631, + "auxiliary_loss_mlp": 0.01083102, + "balance_loss_clip": 1.02613997, + "balance_loss_mlp": 1.00304151, + "epoch": 0.9066313954187458, + "flos": 23367863750400.0, + "grad_norm": 1.9646265671350647, + "language_loss": 0.71285605, + "learning_rate": 9.065633947868568e-08, + "loss": 0.73487341, + "num_input_tokens_seen": 163026505, + "step": 7540, + "time_per_iteration": 3.5846240520477295 + }, + { + "auxiliary_loss_clip": 0.01103896, + "auxiliary_loss_mlp": 0.00872839, + "balance_loss_clip": 1.02309287, + "balance_loss_mlp": 1.00007677, + "epoch": 0.906751638309385, + "flos": 26249623067520.0, + "grad_norm": 2.0123311377956665, + "language_loss": 0.79807627, + "learning_rate": 9.042461531791379e-08, + "loss": 0.81784368, + "num_input_tokens_seen": 163044925, + "step": 7541, + "time_per_iteration": 3.7924318313598633 + }, + { + "auxiliary_loss_clip": 0.0113363, + "auxiliary_loss_mlp": 0.01083915, + "balance_loss_clip": 1.02492452, + "balance_loss_mlp": 1.00385404, + "epoch": 0.906871881200024, + "flos": 16544477485440.0, + "grad_norm": 2.12834280856852, + "language_loss": 0.77812874, + "learning_rate": 9.019318083820903e-08, + "loss": 0.80030417, + "num_input_tokens_seen": 163063505, + "step": 7542, + "time_per_iteration": 2.6417200565338135 + }, + { + "auxiliary_loss_clip": 0.01124423, + "auxiliary_loss_mlp": 0.01083027, + "balance_loss_clip": 1.02475238, + "balance_loss_mlp": 1.00287032, + "epoch": 0.9069921240906631, + "flos": 24605129675520.0, + "grad_norm": 1.6142836985421085, + "language_loss": 0.85271108, + "learning_rate": 8.996203607468045e-08, + "loss": 0.8747856, + "num_input_tokens_seen": 163082505, + "step": 7543, + "time_per_iteration": 2.668438673019409 + }, + { + "auxiliary_loss_clip": 0.01127165, + "auxiliary_loss_mlp": 0.01083576, + "balance_loss_clip": 1.02552652, + "balance_loss_mlp": 1.00341976, + "epoch": 0.9071123669813023, + "flos": 25374731500800.0, + "grad_norm": 1.3999890570101852, + "language_loss": 0.75571263, + "learning_rate": 8.973118106239241e-08, + "loss": 0.77781999, + "num_input_tokens_seen": 163105110, + "step": 7544, + "time_per_iteration": 4.651357173919678 + }, + { + "auxiliary_loss_clip": 0.01090349, + "auxiliary_loss_mlp": 0.01084716, + "balance_loss_clip": 1.02260303, + "balance_loss_mlp": 1.00460744, + "epoch": 0.9072326098719413, + "flos": 26725798690560.0, + "grad_norm": 2.6308709731325615, + "language_loss": 0.94927728, + "learning_rate": 8.95006158363656e-08, + "loss": 0.97102785, + "num_input_tokens_seen": 163125295, + "step": 7545, + "time_per_iteration": 2.9000296592712402 + }, + { + "auxiliary_loss_clip": 0.01120457, + "auxiliary_loss_mlp": 0.01083823, + "balance_loss_clip": 1.02184987, + "balance_loss_mlp": 1.00352371, + "epoch": 0.9073528527625804, + "flos": 23878800760320.0, + "grad_norm": 1.70448665513583, + "language_loss": 0.77444583, + "learning_rate": 8.9270340431576e-08, + "loss": 0.79648864, + "num_input_tokens_seen": 163144385, + "step": 7546, + "time_per_iteration": 2.74888014793396 + }, + { + "auxiliary_loss_clip": 0.01125201, + "auxiliary_loss_mlp": 0.01083578, + "balance_loss_clip": 1.02413297, + "balance_loss_mlp": 1.00346923, + "epoch": 0.9074730956532195, + "flos": 37852144767360.0, + "grad_norm": 1.8905945873853034, + "language_loss": 0.73672348, + "learning_rate": 8.904035488295658e-08, + "loss": 0.7588113, + "num_input_tokens_seen": 163163885, + "step": 7547, + "time_per_iteration": 2.8923535346984863 + }, + { + "auxiliary_loss_clip": 0.01104312, + "auxiliary_loss_mlp": 0.00872852, + "balance_loss_clip": 1.01728582, + "balance_loss_mlp": 1.0010823, + "epoch": 0.9075933385438586, + "flos": 65173307385600.0, + "grad_norm": 0.6591932193197744, + "language_loss": 0.53245664, + "learning_rate": 8.881065922539632e-08, + "loss": 0.55222827, + "num_input_tokens_seen": 163224325, + "step": 7548, + "time_per_iteration": 3.2505757808685303 + }, + { + "auxiliary_loss_clip": 0.01097019, + "auxiliary_loss_mlp": 0.01083622, + "balance_loss_clip": 1.02146602, + "balance_loss_mlp": 1.00360882, + "epoch": 0.9077135814344977, + "flos": 19931571290880.0, + "grad_norm": 1.5925591370015577, + "language_loss": 0.73369545, + "learning_rate": 8.85812534937389e-08, + "loss": 0.75550187, + "num_input_tokens_seen": 163242425, + "step": 7549, + "time_per_iteration": 2.792781352996826 + }, + { + "auxiliary_loss_clip": 0.01110793, + "auxiliary_loss_mlp": 0.01084519, + "balance_loss_clip": 1.02609301, + "balance_loss_mlp": 1.00436258, + "epoch": 0.9078338243251368, + "flos": 17529650784000.0, + "grad_norm": 3.2717154466876512, + "language_loss": 0.67587221, + "learning_rate": 8.835213772278583e-08, + "loss": 0.69782531, + "num_input_tokens_seen": 163259280, + "step": 7550, + "time_per_iteration": 2.7006895542144775 + }, + { + "auxiliary_loss_clip": 0.01098903, + "auxiliary_loss_mlp": 0.01083304, + "balance_loss_clip": 1.01975739, + "balance_loss_mlp": 1.00319529, + "epoch": 0.9079540672157759, + "flos": 28803410277120.0, + "grad_norm": 1.818110339123714, + "language_loss": 0.790429, + "learning_rate": 8.812331194729373e-08, + "loss": 0.81225109, + "num_input_tokens_seen": 163278925, + "step": 7551, + "time_per_iteration": 2.7798008918762207 + }, + { + "auxiliary_loss_clip": 0.01136783, + "auxiliary_loss_mlp": 0.01084202, + "balance_loss_clip": 1.02762294, + "balance_loss_mlp": 1.00399756, + "epoch": 0.9080743101064149, + "flos": 23513840622720.0, + "grad_norm": 1.8641159783180477, + "language_loss": 0.71702147, + "learning_rate": 8.789477620197461e-08, + "loss": 0.73923135, + "num_input_tokens_seen": 163298450, + "step": 7552, + "time_per_iteration": 2.6372032165527344 + }, + { + "auxiliary_loss_clip": 0.01116845, + "auxiliary_loss_mlp": 0.01084465, + "balance_loss_clip": 1.02472818, + "balance_loss_mlp": 1.00435615, + "epoch": 0.9081945529970541, + "flos": 22778102344320.0, + "grad_norm": 2.130134966607325, + "language_loss": 0.79362631, + "learning_rate": 8.766653052149831e-08, + "loss": 0.81563944, + "num_input_tokens_seen": 163313635, + "step": 7553, + "time_per_iteration": 2.7335643768310547 + }, + { + "auxiliary_loss_clip": 0.01115008, + "auxiliary_loss_mlp": 0.01083773, + "balance_loss_clip": 1.02405739, + "balance_loss_mlp": 1.00356948, + "epoch": 0.9083147958876931, + "flos": 18873714821760.0, + "grad_norm": 2.181604327974221, + "language_loss": 0.74187684, + "learning_rate": 8.743857494048823e-08, + "loss": 0.76386464, + "num_input_tokens_seen": 163330450, + "step": 7554, + "time_per_iteration": 2.797152042388916 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01085717, + "balance_loss_clip": 1.02533269, + "balance_loss_mlp": 1.00560856, + "epoch": 0.9084350387783322, + "flos": 18909374048640.0, + "grad_norm": 1.7327123172190426, + "language_loss": 0.6266591, + "learning_rate": 8.721090949352605e-08, + "loss": 0.64860594, + "num_input_tokens_seen": 163346690, + "step": 7555, + "time_per_iteration": 2.8925304412841797 + }, + { + "auxiliary_loss_clip": 0.01112014, + "auxiliary_loss_mlp": 0.01085061, + "balance_loss_clip": 1.0267818, + "balance_loss_mlp": 1.00480914, + "epoch": 0.9085552816689714, + "flos": 20595488325120.0, + "grad_norm": 2.067509972436272, + "language_loss": 0.72842383, + "learning_rate": 8.698353421514793e-08, + "loss": 0.75039464, + "num_input_tokens_seen": 163365065, + "step": 7556, + "time_per_iteration": 2.6809885501861572 + }, + { + "auxiliary_loss_clip": 0.01124242, + "auxiliary_loss_mlp": 0.01083765, + "balance_loss_clip": 1.02444196, + "balance_loss_mlp": 1.00370431, + "epoch": 0.9086755245596104, + "flos": 18113163223680.0, + "grad_norm": 2.175217007562184, + "language_loss": 0.80282348, + "learning_rate": 8.67564491398467e-08, + "loss": 0.82490355, + "num_input_tokens_seen": 163382070, + "step": 7557, + "time_per_iteration": 2.6170411109924316 + }, + { + "auxiliary_loss_clip": 0.01126177, + "auxiliary_loss_mlp": 0.01083097, + "balance_loss_clip": 1.02529573, + "balance_loss_mlp": 1.00308347, + "epoch": 0.9087957674502495, + "flos": 19129793857920.0, + "grad_norm": 1.6799221007961596, + "language_loss": 0.73533684, + "learning_rate": 8.652965430207104e-08, + "loss": 0.7574296, + "num_input_tokens_seen": 163399975, + "step": 7558, + "time_per_iteration": 2.63394832611084 + }, + { + "auxiliary_loss_clip": 0.01125882, + "auxiliary_loss_mlp": 0.01084549, + "balance_loss_clip": 1.02497745, + "balance_loss_mlp": 1.00444043, + "epoch": 0.9089160103408886, + "flos": 18109930999680.0, + "grad_norm": 1.8334103168352702, + "language_loss": 0.65634042, + "learning_rate": 8.630314973622521e-08, + "loss": 0.67844474, + "num_input_tokens_seen": 163417520, + "step": 7559, + "time_per_iteration": 2.694856882095337 + }, + { + "auxiliary_loss_clip": 0.01124424, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_clip": 1.02522373, + "balance_loss_mlp": 1.00376165, + "epoch": 0.9090362532315277, + "flos": 33364855336320.0, + "grad_norm": 1.8755095406039692, + "language_loss": 0.70789677, + "learning_rate": 8.607693547666995e-08, + "loss": 0.72997874, + "num_input_tokens_seen": 163440060, + "step": 7560, + "time_per_iteration": 2.7570953369140625 + }, + { + "auxiliary_loss_clip": 0.01087935, + "auxiliary_loss_mlp": 0.01078888, + "balance_loss_clip": 1.01718163, + "balance_loss_mlp": 0.99992335, + "epoch": 0.9091564961221668, + "flos": 71480585082240.0, + "grad_norm": 0.8881691257524446, + "language_loss": 0.5795669, + "learning_rate": 8.585101155772201e-08, + "loss": 0.60123515, + "num_input_tokens_seen": 163502180, + "step": 7561, + "time_per_iteration": 3.4029622077941895 + }, + { + "auxiliary_loss_clip": 0.0111801, + "auxiliary_loss_mlp": 0.01083758, + "balance_loss_clip": 1.02451241, + "balance_loss_mlp": 1.00360215, + "epoch": 0.9092767390128058, + "flos": 24712574232960.0, + "grad_norm": 2.7046698816594867, + "language_loss": 0.68455637, + "learning_rate": 8.562537801365377e-08, + "loss": 0.70657408, + "num_input_tokens_seen": 163521915, + "step": 7562, + "time_per_iteration": 2.751939058303833 + }, + { + "auxiliary_loss_clip": 0.01134849, + "auxiliary_loss_mlp": 0.01084131, + "balance_loss_clip": 1.02558827, + "balance_loss_mlp": 1.00392723, + "epoch": 0.909396981903445, + "flos": 23586487879680.0, + "grad_norm": 1.8839866074342972, + "language_loss": 0.70018852, + "learning_rate": 8.540003487869362e-08, + "loss": 0.72237831, + "num_input_tokens_seen": 163543585, + "step": 7563, + "time_per_iteration": 2.6375012397766113 + }, + { + "auxiliary_loss_clip": 0.01098733, + "auxiliary_loss_mlp": 0.01082969, + "balance_loss_clip": 1.0226953, + "balance_loss_mlp": 1.00281286, + "epoch": 0.909517224794084, + "flos": 23404169422080.0, + "grad_norm": 1.8776819341213895, + "language_loss": 0.79537332, + "learning_rate": 8.517498218702557e-08, + "loss": 0.81719029, + "num_input_tokens_seen": 163561515, + "step": 7564, + "time_per_iteration": 2.772462844848633 + }, + { + "auxiliary_loss_clip": 0.01107407, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_clip": 1.02357376, + "balance_loss_mlp": 1.00378776, + "epoch": 0.9096374676847231, + "flos": 19208618254080.0, + "grad_norm": 1.6133688358801497, + "language_loss": 0.69201183, + "learning_rate": 8.49502199727905e-08, + "loss": 0.71392441, + "num_input_tokens_seen": 163579540, + "step": 7565, + "time_per_iteration": 3.592820644378662 + }, + { + "auxiliary_loss_clip": 0.01126047, + "auxiliary_loss_mlp": 0.01083371, + "balance_loss_clip": 1.02462316, + "balance_loss_mlp": 1.00326276, + "epoch": 0.9097577105753623, + "flos": 33292495388160.0, + "grad_norm": 2.0992204961954433, + "language_loss": 0.66159797, + "learning_rate": 8.472574827008428e-08, + "loss": 0.68369216, + "num_input_tokens_seen": 163600425, + "step": 7566, + "time_per_iteration": 2.7576024532318115 + }, + { + "auxiliary_loss_clip": 0.01124451, + "auxiliary_loss_mlp": 0.01083136, + "balance_loss_clip": 1.02401209, + "balance_loss_mlp": 1.00302744, + "epoch": 0.9098779534660013, + "flos": 21906443001600.0, + "grad_norm": 1.616716612555276, + "language_loss": 0.83782876, + "learning_rate": 8.450156711295942e-08, + "loss": 0.85990459, + "num_input_tokens_seen": 163620595, + "step": 7567, + "time_per_iteration": 3.7059385776519775 + }, + { + "auxiliary_loss_clip": 0.0111515, + "auxiliary_loss_mlp": 0.01084988, + "balance_loss_clip": 1.0250541, + "balance_loss_mlp": 1.00487971, + "epoch": 0.9099981963566404, + "flos": 25730354102400.0, + "grad_norm": 1.9892107416704945, + "language_loss": 0.86497831, + "learning_rate": 8.427767653542383e-08, + "loss": 0.8869797, + "num_input_tokens_seen": 163635765, + "step": 7568, + "time_per_iteration": 2.7517027854919434 + }, + { + "auxiliary_loss_clip": 0.01097096, + "auxiliary_loss_mlp": 0.01083654, + "balance_loss_clip": 1.02305722, + "balance_loss_mlp": 1.00368857, + "epoch": 0.9101184392472795, + "flos": 21069437304960.0, + "grad_norm": 2.240538345257976, + "language_loss": 0.70600104, + "learning_rate": 8.405407657144125e-08, + "loss": 0.72780859, + "num_input_tokens_seen": 163654925, + "step": 7569, + "time_per_iteration": 3.729684352874756 + }, + { + "auxiliary_loss_clip": 0.01116945, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_clip": 1.02463293, + "balance_loss_mlp": 1.00427485, + "epoch": 0.9102386821379186, + "flos": 24752614919040.0, + "grad_norm": 9.722754729317737, + "language_loss": 0.72389489, + "learning_rate": 8.383076725493232e-08, + "loss": 0.74590635, + "num_input_tokens_seen": 163672245, + "step": 7570, + "time_per_iteration": 3.653956890106201 + }, + { + "auxiliary_loss_clip": 0.01125201, + "auxiliary_loss_mlp": 0.01083789, + "balance_loss_clip": 1.02440274, + "balance_loss_mlp": 1.00368059, + "epoch": 0.9103589250285576, + "flos": 22562818179840.0, + "grad_norm": 4.199090624697263, + "language_loss": 0.67627156, + "learning_rate": 8.360774861977216e-08, + "loss": 0.6983614, + "num_input_tokens_seen": 163691365, + "step": 7571, + "time_per_iteration": 2.66131329536438 + }, + { + "auxiliary_loss_clip": 0.01116866, + "auxiliary_loss_mlp": 0.01083761, + "balance_loss_clip": 1.02348518, + "balance_loss_mlp": 1.00374746, + "epoch": 0.9104791679191968, + "flos": 25373474524800.0, + "grad_norm": 1.758541019468913, + "language_loss": 0.74109, + "learning_rate": 8.338502069979281e-08, + "loss": 0.76309633, + "num_input_tokens_seen": 163711675, + "step": 7572, + "time_per_iteration": 2.8070638179779053 + }, + { + "auxiliary_loss_clip": 0.01126161, + "auxiliary_loss_mlp": 0.0108416, + "balance_loss_clip": 1.02481735, + "balance_loss_mlp": 1.00405109, + "epoch": 0.9105994108098359, + "flos": 14426681558400.0, + "grad_norm": 2.26483791002991, + "language_loss": 0.79752159, + "learning_rate": 8.316258352878214e-08, + "loss": 0.81962478, + "num_input_tokens_seen": 163728095, + "step": 7573, + "time_per_iteration": 2.7125816345214844 + }, + { + "auxiliary_loss_clip": 0.01126863, + "auxiliary_loss_mlp": 0.01084467, + "balance_loss_clip": 1.0256511, + "balance_loss_mlp": 1.00435829, + "epoch": 0.9107196537004749, + "flos": 26718292748160.0, + "grad_norm": 6.7776977046892775, + "language_loss": 0.71511704, + "learning_rate": 8.294043714048338e-08, + "loss": 0.7372303, + "num_input_tokens_seen": 163747175, + "step": 7574, + "time_per_iteration": 2.7602827548980713 + }, + { + "auxiliary_loss_clip": 0.01096519, + "auxiliary_loss_mlp": 0.01078941, + "balance_loss_clip": 1.01767659, + "balance_loss_mlp": 0.99997634, + "epoch": 0.9108398965911141, + "flos": 66532634703360.0, + "grad_norm": 0.7525216404261184, + "language_loss": 0.60494936, + "learning_rate": 8.271858156859624e-08, + "loss": 0.62670398, + "num_input_tokens_seen": 163812545, + "step": 7575, + "time_per_iteration": 3.344956159591675 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.01083435, + "balance_loss_clip": 1.02602851, + "balance_loss_mlp": 1.00332642, + "epoch": 0.9109601394817531, + "flos": 25411073086080.0, + "grad_norm": 1.5856108379027192, + "language_loss": 0.73350668, + "learning_rate": 8.249701684677557e-08, + "loss": 0.75569046, + "num_input_tokens_seen": 163833870, + "step": 7576, + "time_per_iteration": 2.7015862464904785 + }, + { + "auxiliary_loss_clip": 0.01126535, + "auxiliary_loss_mlp": 0.01084448, + "balance_loss_clip": 1.02646065, + "balance_loss_mlp": 1.00443435, + "epoch": 0.9110803823723922, + "flos": 22747794243840.0, + "grad_norm": 1.6368407240621448, + "language_loss": 0.81354463, + "learning_rate": 8.227574300863294e-08, + "loss": 0.83565444, + "num_input_tokens_seen": 163854040, + "step": 7577, + "time_per_iteration": 2.761154890060425 + }, + { + "auxiliary_loss_clip": 0.01116749, + "auxiliary_loss_mlp": 0.01085163, + "balance_loss_clip": 1.02514756, + "balance_loss_mlp": 1.00491095, + "epoch": 0.9112006252630314, + "flos": 48469924131840.0, + "grad_norm": 1.7446488391380193, + "language_loss": 0.69616175, + "learning_rate": 8.205476008773548e-08, + "loss": 0.71818089, + "num_input_tokens_seen": 163878040, + "step": 7578, + "time_per_iteration": 3.0240867137908936 + }, + { + "auxiliary_loss_clip": 0.01105285, + "auxiliary_loss_mlp": 0.01085103, + "balance_loss_clip": 1.0237236, + "balance_loss_mlp": 1.00499487, + "epoch": 0.9113208681536704, + "flos": 30009649829760.0, + "grad_norm": 1.9576069350583047, + "language_loss": 0.825032, + "learning_rate": 8.183406811760596e-08, + "loss": 0.84693587, + "num_input_tokens_seen": 163897770, + "step": 7579, + "time_per_iteration": 2.8615658283233643 + }, + { + "auxiliary_loss_clip": 0.01108765, + "auxiliary_loss_mlp": 0.01084104, + "balance_loss_clip": 1.02447999, + "balance_loss_mlp": 1.00404263, + "epoch": 0.9114411110443095, + "flos": 25594971742080.0, + "grad_norm": 1.4957717260769028, + "language_loss": 0.73866874, + "learning_rate": 8.161366713172313e-08, + "loss": 0.76059747, + "num_input_tokens_seen": 163920160, + "step": 7580, + "time_per_iteration": 2.8097174167633057 + }, + { + "auxiliary_loss_clip": 0.01110002, + "auxiliary_loss_mlp": 0.01084784, + "balance_loss_clip": 1.02451956, + "balance_loss_mlp": 1.00453234, + "epoch": 0.9115613539349486, + "flos": 18399729928320.0, + "grad_norm": 3.7726870597487463, + "language_loss": 0.83920443, + "learning_rate": 8.139355716352137e-08, + "loss": 0.86115229, + "num_input_tokens_seen": 163935000, + "step": 7581, + "time_per_iteration": 2.750880002975464 + }, + { + "auxiliary_loss_clip": 0.01101626, + "auxiliary_loss_mlp": 0.0108299, + "balance_loss_clip": 1.02498174, + "balance_loss_mlp": 1.00283408, + "epoch": 0.9116815968255877, + "flos": 21726171619200.0, + "grad_norm": 2.7937933646434345, + "language_loss": 0.69557405, + "learning_rate": 8.117373824639196e-08, + "loss": 0.71742022, + "num_input_tokens_seen": 163955265, + "step": 7582, + "time_per_iteration": 2.6970343589782715 + }, + { + "auxiliary_loss_clip": 0.01112627, + "auxiliary_loss_mlp": 0.010788, + "balance_loss_clip": 1.01747155, + "balance_loss_mlp": 0.99983597, + "epoch": 0.9118018397162267, + "flos": 65363526835200.0, + "grad_norm": 0.7834793916386282, + "language_loss": 0.59280133, + "learning_rate": 8.095421041368067e-08, + "loss": 0.61471564, + "num_input_tokens_seen": 164014680, + "step": 7583, + "time_per_iteration": 3.127122402191162 + }, + { + "auxiliary_loss_clip": 0.01115191, + "auxiliary_loss_mlp": 0.00872883, + "balance_loss_clip": 1.0242455, + "balance_loss_mlp": 1.00013006, + "epoch": 0.9119220826068659, + "flos": 20922885815040.0, + "grad_norm": 2.8645694844261236, + "language_loss": 0.70162761, + "learning_rate": 8.073497369868999e-08, + "loss": 0.72150838, + "num_input_tokens_seen": 164033140, + "step": 7584, + "time_per_iteration": 2.724684000015259 + }, + { + "auxiliary_loss_clip": 0.01117447, + "auxiliary_loss_mlp": 0.01084295, + "balance_loss_clip": 1.02465272, + "balance_loss_mlp": 1.00409138, + "epoch": 0.912042325497505, + "flos": 28366449327360.0, + "grad_norm": 1.549819182180209, + "language_loss": 0.75740921, + "learning_rate": 8.051602813467772e-08, + "loss": 0.77942663, + "num_input_tokens_seen": 164054995, + "step": 7585, + "time_per_iteration": 2.8534321784973145 + }, + { + "auxiliary_loss_clip": 0.01126976, + "auxiliary_loss_mlp": 0.01084234, + "balance_loss_clip": 1.0259335, + "balance_loss_mlp": 1.00412548, + "epoch": 0.912162568388144, + "flos": 17566782468480.0, + "grad_norm": 2.221573456740216, + "language_loss": 0.71168846, + "learning_rate": 8.029737375485756e-08, + "loss": 0.73380059, + "num_input_tokens_seen": 164074225, + "step": 7586, + "time_per_iteration": 2.646906852722168 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.01083759, + "balance_loss_clip": 1.02560306, + "balance_loss_mlp": 1.00369835, + "epoch": 0.9122828112787832, + "flos": 19827897661440.0, + "grad_norm": 1.7483869808799894, + "language_loss": 0.72891825, + "learning_rate": 8.007901059239986e-08, + "loss": 0.75110531, + "num_input_tokens_seen": 164093505, + "step": 7587, + "time_per_iteration": 2.6484713554382324 + }, + { + "auxiliary_loss_clip": 0.01114749, + "auxiliary_loss_mlp": 0.01083018, + "balance_loss_clip": 1.02296305, + "balance_loss_mlp": 1.00295663, + "epoch": 0.9124030541694222, + "flos": 20813789232000.0, + "grad_norm": 1.5681039624544835, + "language_loss": 0.79808342, + "learning_rate": 7.986093868042964e-08, + "loss": 0.82006109, + "num_input_tokens_seen": 164113750, + "step": 7588, + "time_per_iteration": 2.7454116344451904 + }, + { + "auxiliary_loss_clip": 0.01124377, + "auxiliary_loss_mlp": 0.01083801, + "balance_loss_clip": 1.02399039, + "balance_loss_mlp": 1.00369239, + "epoch": 0.9125232970600613, + "flos": 25192305302400.0, + "grad_norm": 1.703038134870428, + "language_loss": 0.68141586, + "learning_rate": 7.964315805202826e-08, + "loss": 0.70349765, + "num_input_tokens_seen": 164134330, + "step": 7589, + "time_per_iteration": 2.742037534713745 + }, + { + "auxiliary_loss_clip": 0.01111474, + "auxiliary_loss_mlp": 0.01084517, + "balance_loss_clip": 1.02051437, + "balance_loss_mlp": 1.00436103, + "epoch": 0.9126435399507005, + "flos": 19719591177600.0, + "grad_norm": 2.0027947053331854, + "language_loss": 0.73218721, + "learning_rate": 7.942566874023304e-08, + "loss": 0.75414717, + "num_input_tokens_seen": 164153515, + "step": 7590, + "time_per_iteration": 3.6574060916900635 + }, + { + "auxiliary_loss_clip": 0.01101885, + "auxiliary_loss_mlp": 0.0108373, + "balance_loss_clip": 1.02532268, + "balance_loss_mlp": 1.00352573, + "epoch": 0.9127637828413395, + "flos": 19573614305280.0, + "grad_norm": 2.020342953238879, + "language_loss": 0.69705093, + "learning_rate": 7.920847077803649e-08, + "loss": 0.71890706, + "num_input_tokens_seen": 164171305, + "step": 7591, + "time_per_iteration": 2.703869581222534 + }, + { + "auxiliary_loss_clip": 0.0109966, + "auxiliary_loss_mlp": 0.01084238, + "balance_loss_clip": 1.02268529, + "balance_loss_mlp": 1.00422478, + "epoch": 0.9128840257319786, + "flos": 20230635928320.0, + "grad_norm": 1.7209999069574118, + "language_loss": 0.81934053, + "learning_rate": 7.899156419838826e-08, + "loss": 0.84117949, + "num_input_tokens_seen": 164190275, + "step": 7592, + "time_per_iteration": 3.8653361797332764 + }, + { + "auxiliary_loss_clip": 0.01107711, + "auxiliary_loss_mlp": 0.0108311, + "balance_loss_clip": 1.0239408, + "balance_loss_mlp": 1.00300133, + "epoch": 0.9130042686226177, + "flos": 24858658846080.0, + "grad_norm": 1.806073360297389, + "language_loss": 0.65545154, + "learning_rate": 7.87749490341918e-08, + "loss": 0.6773597, + "num_input_tokens_seen": 164210550, + "step": 7593, + "time_per_iteration": 2.806709051132202 + }, + { + "auxiliary_loss_clip": 0.01135516, + "auxiliary_loss_mlp": 0.01085414, + "balance_loss_clip": 1.02620184, + "balance_loss_mlp": 1.00525761, + "epoch": 0.9131245115132568, + "flos": 23581747284480.0, + "grad_norm": 2.038919670346816, + "language_loss": 0.8296653, + "learning_rate": 7.855862531830836e-08, + "loss": 0.85187471, + "num_input_tokens_seen": 164226660, + "step": 7594, + "time_per_iteration": 3.584953546524048 + }, + { + "auxiliary_loss_clip": 0.01127104, + "auxiliary_loss_mlp": 0.01084127, + "balance_loss_clip": 1.02571154, + "balance_loss_mlp": 1.0039711, + "epoch": 0.9132447544038959, + "flos": 19931607204480.0, + "grad_norm": 1.697346446036727, + "language_loss": 0.72772819, + "learning_rate": 7.834259308355373e-08, + "loss": 0.7498405, + "num_input_tokens_seen": 164245425, + "step": 7595, + "time_per_iteration": 3.6280007362365723 + }, + { + "auxiliary_loss_clip": 0.01087176, + "auxiliary_loss_mlp": 0.01084094, + "balance_loss_clip": 1.02243543, + "balance_loss_mlp": 1.0039854, + "epoch": 0.9133649972945349, + "flos": 21981747864960.0, + "grad_norm": 2.079174331200823, + "language_loss": 0.75312567, + "learning_rate": 7.812685236269989e-08, + "loss": 0.77483845, + "num_input_tokens_seen": 164264085, + "step": 7596, + "time_per_iteration": 2.8708760738372803 + }, + { + "auxiliary_loss_clip": 0.01083595, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_clip": 1.02194047, + "balance_loss_mlp": 0.99992067, + "epoch": 0.9134852401851741, + "flos": 71240523511680.0, + "grad_norm": 0.7957453012355701, + "language_loss": 0.58654094, + "learning_rate": 7.791140318847445e-08, + "loss": 0.60816574, + "num_input_tokens_seen": 164322220, + "step": 7597, + "time_per_iteration": 3.3316354751586914 + }, + { + "auxiliary_loss_clip": 0.01109023, + "auxiliary_loss_mlp": 0.01084465, + "balance_loss_clip": 1.02066541, + "balance_loss_mlp": 1.00440371, + "epoch": 0.9136054830758131, + "flos": 23626923615360.0, + "grad_norm": 1.5215537730955968, + "language_loss": 0.80206084, + "learning_rate": 7.769624559356081e-08, + "loss": 0.82399577, + "num_input_tokens_seen": 164345615, + "step": 7598, + "time_per_iteration": 2.768237590789795 + }, + { + "auxiliary_loss_clip": 0.01122814, + "auxiliary_loss_mlp": 0.01085419, + "balance_loss_clip": 1.02334833, + "balance_loss_mlp": 1.00521541, + "epoch": 0.9137257259664522, + "flos": 23438858981760.0, + "grad_norm": 3.4987392222556486, + "language_loss": 0.75626624, + "learning_rate": 7.748137961059842e-08, + "loss": 0.77834857, + "num_input_tokens_seen": 164359595, + "step": 7599, + "time_per_iteration": 2.7339935302734375 + }, + { + "auxiliary_loss_clip": 0.01134392, + "auxiliary_loss_mlp": 0.01083644, + "balance_loss_clip": 1.02572298, + "balance_loss_mlp": 1.00358272, + "epoch": 0.9138459688570914, + "flos": 19127854523520.0, + "grad_norm": 2.553219143908604, + "language_loss": 0.65304524, + "learning_rate": 7.726680527218211e-08, + "loss": 0.67522562, + "num_input_tokens_seen": 164376635, + "step": 7600, + "time_per_iteration": 2.623537302017212 + }, + { + "auxiliary_loss_clip": 0.01134315, + "auxiliary_loss_mlp": 0.01083875, + "balance_loss_clip": 1.02467287, + "balance_loss_mlp": 1.00376618, + "epoch": 0.9139662117477304, + "flos": 46281240714240.0, + "grad_norm": 1.7061027749115114, + "language_loss": 0.75630313, + "learning_rate": 7.70525226108627e-08, + "loss": 0.77848506, + "num_input_tokens_seen": 164400305, + "step": 7601, + "time_per_iteration": 2.856391191482544 + }, + { + "auxiliary_loss_clip": 0.01126405, + "auxiliary_loss_mlp": 0.01084168, + "balance_loss_clip": 1.02584875, + "balance_loss_mlp": 1.00410664, + "epoch": 0.9140864546383695, + "flos": 22273198819200.0, + "grad_norm": 1.6582194991384882, + "language_loss": 0.80060524, + "learning_rate": 7.683853165914666e-08, + "loss": 0.82271093, + "num_input_tokens_seen": 164418075, + "step": 7602, + "time_per_iteration": 2.63480544090271 + }, + { + "auxiliary_loss_clip": 0.01098687, + "auxiliary_loss_mlp": 0.01084494, + "balance_loss_clip": 1.0239954, + "balance_loss_mlp": 1.00438559, + "epoch": 0.9142066975290086, + "flos": 17530009920000.0, + "grad_norm": 1.6719319124884284, + "language_loss": 0.77359605, + "learning_rate": 7.662483244949602e-08, + "loss": 0.79542786, + "num_input_tokens_seen": 164435335, + "step": 7603, + "time_per_iteration": 2.8231921195983887 + }, + { + "auxiliary_loss_clip": 0.01096718, + "auxiliary_loss_mlp": 0.01084904, + "balance_loss_clip": 1.02118325, + "balance_loss_mlp": 1.00474775, + "epoch": 0.9143269404196477, + "flos": 17712148809600.0, + "grad_norm": 2.1556280216523835, + "language_loss": 0.80488044, + "learning_rate": 7.641142501432951e-08, + "loss": 0.82669663, + "num_input_tokens_seen": 164451530, + "step": 7604, + "time_per_iteration": 2.729698419570923 + }, + { + "auxiliary_loss_clip": 0.01116392, + "auxiliary_loss_mlp": 0.01084495, + "balance_loss_clip": 1.02391624, + "balance_loss_mlp": 1.00443363, + "epoch": 0.9144471833102867, + "flos": 33323414019840.0, + "grad_norm": 1.6942100414411012, + "language_loss": 0.7354328, + "learning_rate": 7.619830938602013e-08, + "loss": 0.75744164, + "num_input_tokens_seen": 164472755, + "step": 7605, + "time_per_iteration": 2.8637256622314453 + }, + { + "auxiliary_loss_clip": 0.01126975, + "auxiliary_loss_mlp": 0.01084017, + "balance_loss_clip": 1.02514637, + "balance_loss_mlp": 1.0039556, + "epoch": 0.9145674262009259, + "flos": 21068970428160.0, + "grad_norm": 1.8463976894503018, + "language_loss": 0.82407349, + "learning_rate": 7.598548559689777e-08, + "loss": 0.84618342, + "num_input_tokens_seen": 164491155, + "step": 7606, + "time_per_iteration": 2.7067949771881104 + }, + { + "auxiliary_loss_clip": 0.01106746, + "auxiliary_loss_mlp": 0.0108431, + "balance_loss_clip": 1.0234412, + "balance_loss_mlp": 1.00410557, + "epoch": 0.914687669091565, + "flos": 16800269212800.0, + "grad_norm": 2.007817885017254, + "language_loss": 0.81253636, + "learning_rate": 7.577295367924751e-08, + "loss": 0.83444691, + "num_input_tokens_seen": 164507555, + "step": 7607, + "time_per_iteration": 2.7119832038879395 + }, + { + "auxiliary_loss_clip": 0.01117534, + "auxiliary_loss_mlp": 0.01084774, + "balance_loss_clip": 1.02532804, + "balance_loss_mlp": 1.00466514, + "epoch": 0.914807911982204, + "flos": 25773627012480.0, + "grad_norm": 1.583283274697945, + "language_loss": 0.82064748, + "learning_rate": 7.556071366531002e-08, + "loss": 0.84267056, + "num_input_tokens_seen": 164528525, + "step": 7608, + "time_per_iteration": 2.793396234512329 + }, + { + "auxiliary_loss_clip": 0.01123963, + "auxiliary_loss_mlp": 0.01084575, + "balance_loss_clip": 1.02465463, + "balance_loss_mlp": 1.00432312, + "epoch": 0.9149281548728432, + "flos": 19208043636480.0, + "grad_norm": 2.8444235924935106, + "language_loss": 0.78893268, + "learning_rate": 7.53487655872822e-08, + "loss": 0.81101799, + "num_input_tokens_seen": 164547695, + "step": 7609, + "time_per_iteration": 2.6804606914520264 + }, + { + "auxiliary_loss_clip": 0.01097175, + "auxiliary_loss_mlp": 0.01084344, + "balance_loss_clip": 1.02194226, + "balance_loss_mlp": 1.00433075, + "epoch": 0.9150483977634822, + "flos": 26870554500480.0, + "grad_norm": 1.7031671137584639, + "language_loss": 0.73949772, + "learning_rate": 7.513710947731656e-08, + "loss": 0.76131296, + "num_input_tokens_seen": 164568905, + "step": 7610, + "time_per_iteration": 2.9253463745117188 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01084481, + "balance_loss_clip": 1.02351403, + "balance_loss_mlp": 1.00437284, + "epoch": 0.9151686406541213, + "flos": 21908956953600.0, + "grad_norm": 1.9015373255692867, + "language_loss": 0.85200012, + "learning_rate": 7.492574536752095e-08, + "loss": 0.87394202, + "num_input_tokens_seen": 164588895, + "step": 7611, + "time_per_iteration": 2.692918300628662 + }, + { + "auxiliary_loss_clip": 0.01124131, + "auxiliary_loss_mlp": 0.01083883, + "balance_loss_clip": 1.02506769, + "balance_loss_mlp": 1.00382173, + "epoch": 0.9152888835447605, + "flos": 27308556944640.0, + "grad_norm": 1.7289940358334153, + "language_loss": 0.78040433, + "learning_rate": 7.471467328995907e-08, + "loss": 0.80248451, + "num_input_tokens_seen": 164607705, + "step": 7612, + "time_per_iteration": 2.758660078048706 + }, + { + "auxiliary_loss_clip": 0.01054428, + "auxiliary_loss_mlp": 0.0108347, + "balance_loss_clip": 1.02243328, + "balance_loss_mlp": 1.00345659, + "epoch": 0.9154091264353995, + "flos": 13370728510080.0, + "grad_norm": 2.469333695530553, + "language_loss": 0.60321385, + "learning_rate": 7.450389327665018e-08, + "loss": 0.62459278, + "num_input_tokens_seen": 164625540, + "step": 7613, + "time_per_iteration": 2.919658899307251 + }, + { + "auxiliary_loss_clip": 0.01101284, + "auxiliary_loss_mlp": 0.01084776, + "balance_loss_clip": 1.02097571, + "balance_loss_mlp": 1.00461948, + "epoch": 0.9155293693260386, + "flos": 20193037367040.0, + "grad_norm": 2.678719645704792, + "language_loss": 0.66975427, + "learning_rate": 7.429340535957029e-08, + "loss": 0.69161481, + "num_input_tokens_seen": 164640735, + "step": 7614, + "time_per_iteration": 2.7819032669067383 + }, + { + "auxiliary_loss_clip": 0.0111865, + "auxiliary_loss_mlp": 0.01084042, + "balance_loss_clip": 1.02641058, + "balance_loss_mlp": 1.00388539, + "epoch": 0.9156496122166777, + "flos": 19354990176000.0, + "grad_norm": 2.206620838606617, + "language_loss": 0.70698476, + "learning_rate": 7.40832095706494e-08, + "loss": 0.72901177, + "num_input_tokens_seen": 164657430, + "step": 7615, + "time_per_iteration": 3.5921919345855713 + }, + { + "auxiliary_loss_clip": 0.01108226, + "auxiliary_loss_mlp": 0.01083785, + "balance_loss_clip": 1.02441549, + "balance_loss_mlp": 1.0037241, + "epoch": 0.9157698551073168, + "flos": 21107287261440.0, + "grad_norm": 1.7044785454081937, + "language_loss": 0.80124366, + "learning_rate": 7.387330594177443e-08, + "loss": 0.82316375, + "num_input_tokens_seen": 164679505, + "step": 7616, + "time_per_iteration": 2.8105812072753906 + }, + { + "auxiliary_loss_clip": 0.01107908, + "auxiliary_loss_mlp": 0.01083975, + "balance_loss_clip": 1.02414334, + "balance_loss_mlp": 1.00381875, + "epoch": 0.9158900979979558, + "flos": 25193167228800.0, + "grad_norm": 1.6391307045746437, + "language_loss": 0.79066265, + "learning_rate": 7.366369450478749e-08, + "loss": 0.81258154, + "num_input_tokens_seen": 164700615, + "step": 7617, + "time_per_iteration": 3.815903902053833 + }, + { + "auxiliary_loss_clip": 0.01105354, + "auxiliary_loss_mlp": 0.01083974, + "balance_loss_clip": 1.02252924, + "balance_loss_mlp": 1.00396109, + "epoch": 0.916010340888595, + "flos": 30146648302080.0, + "grad_norm": 1.60844759692301, + "language_loss": 0.66329819, + "learning_rate": 7.345437529148646e-08, + "loss": 0.68519145, + "num_input_tokens_seen": 164719625, + "step": 7618, + "time_per_iteration": 2.8326408863067627 + }, + { + "auxiliary_loss_clip": 0.0110177, + "auxiliary_loss_mlp": 0.01083031, + "balance_loss_clip": 1.02407527, + "balance_loss_mlp": 1.00292194, + "epoch": 0.9161305837792341, + "flos": 17091827907840.0, + "grad_norm": 2.1666168227284794, + "language_loss": 0.73050785, + "learning_rate": 7.324534833362483e-08, + "loss": 0.75235587, + "num_input_tokens_seen": 164737200, + "step": 7619, + "time_per_iteration": 3.6356968879699707 + }, + { + "auxiliary_loss_clip": 0.01113563, + "auxiliary_loss_mlp": 0.01084395, + "balance_loss_clip": 1.02246058, + "balance_loss_mlp": 1.00433373, + "epoch": 0.9162508266698731, + "flos": 22893699288960.0, + "grad_norm": 1.729602807952115, + "language_loss": 0.68680024, + "learning_rate": 7.303661366291192e-08, + "loss": 0.70877987, + "num_input_tokens_seen": 164757870, + "step": 7620, + "time_per_iteration": 3.73970627784729 + }, + { + "auxiliary_loss_clip": 0.01100902, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_clip": 1.02541828, + "balance_loss_mlp": 1.00466609, + "epoch": 0.9163710695605123, + "flos": 19974808287360.0, + "grad_norm": 1.9243369195191349, + "language_loss": 0.81554902, + "learning_rate": 7.28281713110126e-08, + "loss": 0.83740532, + "num_input_tokens_seen": 164775945, + "step": 7621, + "time_per_iteration": 2.811737298965454 + }, + { + "auxiliary_loss_clip": 0.01111575, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_clip": 1.02139378, + "balance_loss_mlp": 1.00439548, + "epoch": 0.9164913124511513, + "flos": 22783812606720.0, + "grad_norm": 2.054566647629135, + "language_loss": 0.76859558, + "learning_rate": 7.262002130954759e-08, + "loss": 0.79055589, + "num_input_tokens_seen": 164794400, + "step": 7622, + "time_per_iteration": 2.725728750228882 + }, + { + "auxiliary_loss_clip": 0.0110117, + "auxiliary_loss_mlp": 0.01084637, + "balance_loss_clip": 1.02508903, + "balance_loss_mlp": 1.00452828, + "epoch": 0.9166115553417904, + "flos": 24900854348160.0, + "grad_norm": 2.3096391692346647, + "language_loss": 0.7923491, + "learning_rate": 7.241216369009296e-08, + "loss": 0.81420714, + "num_input_tokens_seen": 164814585, + "step": 7623, + "time_per_iteration": 2.8586416244506836 + }, + { + "auxiliary_loss_clip": 0.01134331, + "auxiliary_loss_mlp": 0.01084352, + "balance_loss_clip": 1.02475095, + "balance_loss_mlp": 1.00424337, + "epoch": 0.9167317982324296, + "flos": 25702919089920.0, + "grad_norm": 3.1344306641417417, + "language_loss": 0.66277909, + "learning_rate": 7.220459848418037e-08, + "loss": 0.68496597, + "num_input_tokens_seen": 164834660, + "step": 7624, + "time_per_iteration": 2.6431775093078613 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01084717, + "balance_loss_clip": 1.02661407, + "balance_loss_mlp": 1.00456047, + "epoch": 0.9168520411230686, + "flos": 15632813370240.0, + "grad_norm": 1.579122436983363, + "language_loss": 0.80028725, + "learning_rate": 7.199732572329708e-08, + "loss": 0.82249367, + "num_input_tokens_seen": 164852560, + "step": 7625, + "time_per_iteration": 2.64607310295105 + }, + { + "auxiliary_loss_clip": 0.01094494, + "auxiliary_loss_mlp": 0.01084374, + "balance_loss_clip": 1.02582991, + "balance_loss_mlp": 1.00421798, + "epoch": 0.9169722840137077, + "flos": 30258151096320.0, + "grad_norm": 2.02336049830591, + "language_loss": 0.75504082, + "learning_rate": 7.179034543888684e-08, + "loss": 0.77682948, + "num_input_tokens_seen": 164872065, + "step": 7626, + "time_per_iteration": 2.7697043418884277 + }, + { + "auxiliary_loss_clip": 0.0112584, + "auxiliary_loss_mlp": 0.01083709, + "balance_loss_clip": 1.0250833, + "balance_loss_mlp": 1.0036006, + "epoch": 0.9170925269043467, + "flos": 22491643380480.0, + "grad_norm": 2.026914350065851, + "language_loss": 0.77555561, + "learning_rate": 7.158365766234808e-08, + "loss": 0.79765117, + "num_input_tokens_seen": 164890915, + "step": 7627, + "time_per_iteration": 2.668701648712158 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01084213, + "balance_loss_clip": 1.02251065, + "balance_loss_mlp": 1.00400865, + "epoch": 0.9172127697949859, + "flos": 22893914770560.0, + "grad_norm": 1.877450650686737, + "language_loss": 0.72360283, + "learning_rate": 7.137726242503527e-08, + "loss": 0.7455135, + "num_input_tokens_seen": 164909835, + "step": 7628, + "time_per_iteration": 2.758106231689453 + }, + { + "auxiliary_loss_clip": 0.01123937, + "auxiliary_loss_mlp": 0.00872936, + "balance_loss_clip": 1.02423716, + "balance_loss_mlp": 1.0001055, + "epoch": 0.917333012685625, + "flos": 17451867882240.0, + "grad_norm": 2.333842782807697, + "language_loss": 0.77898824, + "learning_rate": 7.11711597582585e-08, + "loss": 0.79895699, + "num_input_tokens_seen": 164927195, + "step": 7629, + "time_per_iteration": 2.649836778640747 + }, + { + "auxiliary_loss_clip": 0.01109887, + "auxiliary_loss_mlp": 0.01083305, + "balance_loss_clip": 1.02490127, + "balance_loss_mlp": 1.003196, + "epoch": 0.917453255576264, + "flos": 14318949692160.0, + "grad_norm": 1.5759091732378063, + "language_loss": 0.7998035, + "learning_rate": 7.096534969328271e-08, + "loss": 0.82173544, + "num_input_tokens_seen": 164944640, + "step": 7630, + "time_per_iteration": 2.7939627170562744 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01084059, + "balance_loss_clip": 1.02355719, + "balance_loss_mlp": 1.00404584, + "epoch": 0.9175734984669032, + "flos": 20741177888640.0, + "grad_norm": 1.8935045153647372, + "language_loss": 0.84308565, + "learning_rate": 7.075983226132987e-08, + "loss": 0.86509097, + "num_input_tokens_seen": 164963570, + "step": 7631, + "time_per_iteration": 2.7160744667053223 + }, + { + "auxiliary_loss_clip": 0.01115083, + "auxiliary_loss_mlp": 0.00872957, + "balance_loss_clip": 1.02331877, + "balance_loss_mlp": 1.00006461, + "epoch": 0.9176937413575422, + "flos": 14830497233280.0, + "grad_norm": 11.141164921676856, + "language_loss": 0.79484004, + "learning_rate": 7.055460749357656e-08, + "loss": 0.81472051, + "num_input_tokens_seen": 164979850, + "step": 7632, + "time_per_iteration": 2.758995532989502 + }, + { + "auxiliary_loss_clip": 0.01113088, + "auxiliary_loss_mlp": 0.01084277, + "balance_loss_clip": 1.02338314, + "balance_loss_mlp": 1.00412083, + "epoch": 0.9178139842481813, + "flos": 18474603828480.0, + "grad_norm": 1.769813672553649, + "language_loss": 0.70217025, + "learning_rate": 7.034967542115521e-08, + "loss": 0.72414386, + "num_input_tokens_seen": 164998115, + "step": 7633, + "time_per_iteration": 2.663409948348999 + }, + { + "auxiliary_loss_clip": 0.01125775, + "auxiliary_loss_mlp": 0.00872938, + "balance_loss_clip": 1.0251174, + "balance_loss_mlp": 1.00011325, + "epoch": 0.9179342271388204, + "flos": 20047455544320.0, + "grad_norm": 1.8254552931934416, + "language_loss": 0.75629079, + "learning_rate": 7.014503607515388e-08, + "loss": 0.7762779, + "num_input_tokens_seen": 165017420, + "step": 7634, + "time_per_iteration": 2.6672956943511963 + }, + { + "auxiliary_loss_clip": 0.01115652, + "auxiliary_loss_mlp": 0.01084866, + "balance_loss_clip": 1.02513528, + "balance_loss_mlp": 1.00475705, + "epoch": 0.9180544700294595, + "flos": 24676232647680.0, + "grad_norm": 1.9699328631620698, + "language_loss": 0.67878151, + "learning_rate": 6.994068948661592e-08, + "loss": 0.70078665, + "num_input_tokens_seen": 165035575, + "step": 7635, + "time_per_iteration": 2.764066457748413 + }, + { + "auxiliary_loss_clip": 0.01124066, + "auxiliary_loss_mlp": 0.01085298, + "balance_loss_clip": 1.02433372, + "balance_loss_mlp": 1.00504649, + "epoch": 0.9181747129200986, + "flos": 16727478301440.0, + "grad_norm": 1.9855061830525214, + "language_loss": 0.76677072, + "learning_rate": 6.973663568654142e-08, + "loss": 0.78886437, + "num_input_tokens_seen": 165053280, + "step": 7636, + "time_per_iteration": 2.6734583377838135 + }, + { + "auxiliary_loss_clip": 0.01134795, + "auxiliary_loss_mlp": 0.0108473, + "balance_loss_clip": 1.02582216, + "balance_loss_mlp": 1.00447834, + "epoch": 0.9182949558107377, + "flos": 24271626873600.0, + "grad_norm": 2.0835933198523473, + "language_loss": 0.65662432, + "learning_rate": 6.953287470588386e-08, + "loss": 0.67881954, + "num_input_tokens_seen": 165071235, + "step": 7637, + "time_per_iteration": 2.6241607666015625 + }, + { + "auxiliary_loss_clip": 0.01125524, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_clip": 1.02433205, + "balance_loss_mlp": 1.00431108, + "epoch": 0.9184151987013768, + "flos": 22082117443200.0, + "grad_norm": 2.1045590500884446, + "language_loss": 0.85694575, + "learning_rate": 6.932940657555452e-08, + "loss": 0.87904477, + "num_input_tokens_seen": 165087365, + "step": 7638, + "time_per_iteration": 2.6689436435699463 + }, + { + "auxiliary_loss_clip": 0.01134619, + "auxiliary_loss_mlp": 0.01083625, + "balance_loss_clip": 1.02581835, + "balance_loss_mlp": 1.0035646, + "epoch": 0.9185354415920158, + "flos": 32166732257280.0, + "grad_norm": 1.8091581708543822, + "language_loss": 0.76101589, + "learning_rate": 6.912623132641938e-08, + "loss": 0.78319836, + "num_input_tokens_seen": 165112455, + "step": 7639, + "time_per_iteration": 2.7284727096557617 + }, + { + "auxiliary_loss_clip": 0.01111032, + "auxiliary_loss_mlp": 0.01084854, + "balance_loss_clip": 1.02451479, + "balance_loss_mlp": 1.00469744, + "epoch": 0.918655684482655, + "flos": 20997831542400.0, + "grad_norm": 1.7643524670079431, + "language_loss": 0.77096069, + "learning_rate": 6.892334898929952e-08, + "loss": 0.79291952, + "num_input_tokens_seen": 165132700, + "step": 7640, + "time_per_iteration": 3.5823371410369873 + }, + { + "auxiliary_loss_clip": 0.0112543, + "auxiliary_loss_mlp": 0.01084178, + "balance_loss_clip": 1.02463174, + "balance_loss_mlp": 1.00406885, + "epoch": 0.918775927373294, + "flos": 15560704817280.0, + "grad_norm": 1.881798346797688, + "language_loss": 0.84637153, + "learning_rate": 6.872075959497236e-08, + "loss": 0.86846757, + "num_input_tokens_seen": 165151475, + "step": 7641, + "time_per_iteration": 2.6323788166046143 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.0108298, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.00296617, + "epoch": 0.9188961702639331, + "flos": 29934057657600.0, + "grad_norm": 1.6771155606760848, + "language_loss": 0.83138531, + "learning_rate": 6.85184631741702e-08, + "loss": 0.85330337, + "num_input_tokens_seen": 165172040, + "step": 7642, + "time_per_iteration": 3.664416551589966 + }, + { + "auxiliary_loss_clip": 0.01123756, + "auxiliary_loss_mlp": 0.01084227, + "balance_loss_clip": 1.02403593, + "balance_loss_mlp": 1.004071, + "epoch": 0.9190164131545723, + "flos": 20701244943360.0, + "grad_norm": 1.7763104048459748, + "language_loss": 0.77460521, + "learning_rate": 6.831645975758161e-08, + "loss": 0.79668504, + "num_input_tokens_seen": 165189980, + "step": 7643, + "time_per_iteration": 2.6774141788482666 + }, + { + "auxiliary_loss_clip": 0.01118131, + "auxiliary_loss_mlp": 0.01084617, + "balance_loss_clip": 1.0253222, + "balance_loss_mlp": 1.00436544, + "epoch": 0.9191366560452113, + "flos": 25629912696960.0, + "grad_norm": 1.7779469788513251, + "language_loss": 0.67030728, + "learning_rate": 6.811474937585026e-08, + "loss": 0.69233477, + "num_input_tokens_seen": 165209770, + "step": 7644, + "time_per_iteration": 2.7183210849761963 + }, + { + "auxiliary_loss_clip": 0.01107631, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_clip": 1.02516174, + "balance_loss_mlp": 1.00417233, + "epoch": 0.9192568989358504, + "flos": 21434325615360.0, + "grad_norm": 1.634824210464234, + "language_loss": 0.79012394, + "learning_rate": 6.79133320595755e-08, + "loss": 0.81204259, + "num_input_tokens_seen": 165229690, + "step": 7645, + "time_per_iteration": 4.595020771026611 + }, + { + "auxiliary_loss_clip": 0.01117081, + "auxiliary_loss_mlp": 0.01083581, + "balance_loss_clip": 1.02553225, + "balance_loss_mlp": 1.00347269, + "epoch": 0.9193771418264896, + "flos": 23185078416000.0, + "grad_norm": 1.737103901179292, + "language_loss": 0.75491428, + "learning_rate": 6.771220783931198e-08, + "loss": 0.77692091, + "num_input_tokens_seen": 165249850, + "step": 7646, + "time_per_iteration": 2.761813163757324 + }, + { + "auxiliary_loss_clip": 0.010439, + "auxiliary_loss_mlp": 0.00873104, + "balance_loss_clip": 1.01471436, + "balance_loss_mlp": 1.0013994, + "epoch": 0.9194973847171286, + "flos": 70582963184640.0, + "grad_norm": 0.8432590120931055, + "language_loss": 0.64658666, + "learning_rate": 6.751137674556994e-08, + "loss": 0.6657567, + "num_input_tokens_seen": 165310235, + "step": 7647, + "time_per_iteration": 3.7178030014038086 + }, + { + "auxiliary_loss_clip": 0.0112496, + "auxiliary_loss_mlp": 0.01083063, + "balance_loss_clip": 1.02374959, + "balance_loss_mlp": 1.00281096, + "epoch": 0.9196176276077677, + "flos": 14720682378240.0, + "grad_norm": 2.0676828570127066, + "language_loss": 0.77619278, + "learning_rate": 6.731083880881572e-08, + "loss": 0.79827297, + "num_input_tokens_seen": 165326455, + "step": 7648, + "time_per_iteration": 3.3635482788085938 + }, + { + "auxiliary_loss_clip": 0.01114749, + "auxiliary_loss_mlp": 0.01084, + "balance_loss_clip": 1.02376854, + "balance_loss_mlp": 1.00403428, + "epoch": 0.9197378704984068, + "flos": 23294893271040.0, + "grad_norm": 1.8682147678211156, + "language_loss": 0.81093144, + "learning_rate": 6.711059405947072e-08, + "loss": 0.832919, + "num_input_tokens_seen": 165344645, + "step": 7649, + "time_per_iteration": 2.768871307373047 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01084442, + "balance_loss_clip": 1.02303922, + "balance_loss_mlp": 1.0044291, + "epoch": 0.9198581133890459, + "flos": 20302564913280.0, + "grad_norm": 1.9923379911669852, + "language_loss": 0.77033627, + "learning_rate": 6.691064252791156e-08, + "loss": 0.79222101, + "num_input_tokens_seen": 165364120, + "step": 7650, + "time_per_iteration": 2.802065849304199 + }, + { + "auxiliary_loss_clip": 0.01087246, + "auxiliary_loss_mlp": 0.01083817, + "balance_loss_clip": 1.020612, + "balance_loss_mlp": 1.00366032, + "epoch": 0.9199783562796849, + "flos": 17675663569920.0, + "grad_norm": 1.5100938822483285, + "language_loss": 0.77847344, + "learning_rate": 6.67109842444713e-08, + "loss": 0.80018407, + "num_input_tokens_seen": 165383050, + "step": 7651, + "time_per_iteration": 2.8614213466644287 + }, + { + "auxiliary_loss_clip": 0.01121243, + "auxiliary_loss_mlp": 0.00872922, + "balance_loss_clip": 1.02264929, + "balance_loss_mlp": 1.00006878, + "epoch": 0.9200985991703241, + "flos": 17676022705920.0, + "grad_norm": 1.7625365475507773, + "language_loss": 0.7668519, + "learning_rate": 6.651161923943704e-08, + "loss": 0.78679347, + "num_input_tokens_seen": 165400955, + "step": 7652, + "time_per_iteration": 2.731545925140381 + }, + { + "auxiliary_loss_clip": 0.01126621, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_clip": 1.0245769, + "balance_loss_mlp": 1.00414133, + "epoch": 0.9202188420609632, + "flos": 20996574566400.0, + "grad_norm": 1.6302766476518853, + "language_loss": 0.76682913, + "learning_rate": 6.631254754305326e-08, + "loss": 0.78893882, + "num_input_tokens_seen": 165420415, + "step": 7653, + "time_per_iteration": 2.7027299404144287 + }, + { + "auxiliary_loss_clip": 0.01134118, + "auxiliary_loss_mlp": 0.01084573, + "balance_loss_clip": 1.02461553, + "balance_loss_mlp": 1.00446415, + "epoch": 0.9203390849516022, + "flos": 13918222586880.0, + "grad_norm": 1.7381461181489015, + "language_loss": 0.78163946, + "learning_rate": 6.611376918551848e-08, + "loss": 0.80382633, + "num_input_tokens_seen": 165439200, + "step": 7654, + "time_per_iteration": 2.652444839477539 + }, + { + "auxiliary_loss_clip": 0.01092489, + "auxiliary_loss_mlp": 0.00872925, + "balance_loss_clip": 1.02424359, + "balance_loss_mlp": 1.00003934, + "epoch": 0.9204593278422414, + "flos": 21175912195200.0, + "grad_norm": 1.9759565635504395, + "language_loss": 0.79333448, + "learning_rate": 6.591528419698744e-08, + "loss": 0.81298864, + "num_input_tokens_seen": 165458985, + "step": 7655, + "time_per_iteration": 2.75502872467041 + }, + { + "auxiliary_loss_clip": 0.01117649, + "auxiliary_loss_mlp": 0.01084355, + "balance_loss_clip": 1.02423072, + "balance_loss_mlp": 1.00415051, + "epoch": 0.9205795707328804, + "flos": 14501375890560.0, + "grad_norm": 2.2865436634512584, + "language_loss": 0.83101928, + "learning_rate": 6.571709260756986e-08, + "loss": 0.85303932, + "num_input_tokens_seen": 165475630, + "step": 7656, + "time_per_iteration": 2.6895155906677246 + }, + { + "auxiliary_loss_clip": 0.0112627, + "auxiliary_loss_mlp": 0.01085053, + "balance_loss_clip": 1.02615011, + "balance_loss_mlp": 1.00494421, + "epoch": 0.9206998136235195, + "flos": 22417559579520.0, + "grad_norm": 1.978037045287657, + "language_loss": 0.76163834, + "learning_rate": 6.551919444733122e-08, + "loss": 0.78375149, + "num_input_tokens_seen": 165493445, + "step": 7657, + "time_per_iteration": 2.6324145793914795 + }, + { + "auxiliary_loss_clip": 0.01115845, + "auxiliary_loss_mlp": 0.01083177, + "balance_loss_clip": 1.02425647, + "balance_loss_mlp": 1.00311613, + "epoch": 0.9208200565141585, + "flos": 53358407544960.0, + "grad_norm": 1.804433756899718, + "language_loss": 0.66249532, + "learning_rate": 6.53215897462931e-08, + "loss": 0.68448555, + "num_input_tokens_seen": 165517200, + "step": 7658, + "time_per_iteration": 3.053415060043335 + }, + { + "auxiliary_loss_clip": 0.0112646, + "auxiliary_loss_mlp": 0.01083271, + "balance_loss_clip": 1.02521265, + "balance_loss_mlp": 1.00316238, + "epoch": 0.9209402994047977, + "flos": 30589139946240.0, + "grad_norm": 1.8285882643018139, + "language_loss": 0.74736476, + "learning_rate": 6.512427853443103e-08, + "loss": 0.76946211, + "num_input_tokens_seen": 165539280, + "step": 7659, + "time_per_iteration": 2.753861904144287 + }, + { + "auxiliary_loss_clip": 0.01125163, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_clip": 1.02410626, + "balance_loss_mlp": 1.0040822, + "epoch": 0.9210605422954368, + "flos": 29132711187840.0, + "grad_norm": 1.4682735107256593, + "language_loss": 0.75669038, + "learning_rate": 6.492726084167799e-08, + "loss": 0.77878344, + "num_input_tokens_seen": 165561395, + "step": 7660, + "time_per_iteration": 2.7212932109832764 + }, + { + "auxiliary_loss_clip": 0.0111257, + "auxiliary_loss_mlp": 0.01078922, + "balance_loss_clip": 1.01742887, + "balance_loss_mlp": 0.9999575, + "epoch": 0.9211807851860758, + "flos": 54853838472960.0, + "grad_norm": 0.7754584105226509, + "language_loss": 0.57559407, + "learning_rate": 6.473053669792072e-08, + "loss": 0.59750903, + "num_input_tokens_seen": 165616085, + "step": 7661, + "time_per_iteration": 3.057512044906616 + }, + { + "auxiliary_loss_clip": 0.01126735, + "auxiliary_loss_mlp": 0.01084141, + "balance_loss_clip": 1.02576089, + "balance_loss_mlp": 1.00403249, + "epoch": 0.921301028076715, + "flos": 19201974238080.0, + "grad_norm": 2.073856001131735, + "language_loss": 0.72943574, + "learning_rate": 6.453410613300248e-08, + "loss": 0.75154448, + "num_input_tokens_seen": 165634015, + "step": 7662, + "time_per_iteration": 2.6165761947631836 + }, + { + "auxiliary_loss_clip": 0.01090353, + "auxiliary_loss_mlp": 0.01083878, + "balance_loss_clip": 1.0234617, + "balance_loss_mlp": 1.00367403, + "epoch": 0.921421270967354, + "flos": 27526893765120.0, + "grad_norm": 1.656796934794067, + "language_loss": 0.58200765, + "learning_rate": 6.43379691767214e-08, + "loss": 0.60374999, + "num_input_tokens_seen": 165653220, + "step": 7663, + "time_per_iteration": 2.9232802391052246 + }, + { + "auxiliary_loss_clip": 0.01080612, + "auxiliary_loss_mlp": 0.01079099, + "balance_loss_clip": 1.01815891, + "balance_loss_mlp": 1.00013435, + "epoch": 0.9215415138579931, + "flos": 70209311955840.0, + "grad_norm": 0.7192851587867812, + "language_loss": 0.55180216, + "learning_rate": 6.414212585883105e-08, + "loss": 0.57339931, + "num_input_tokens_seen": 165715850, + "step": 7664, + "time_per_iteration": 3.4598958492279053 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01083115, + "balance_loss_clip": 1.02477622, + "balance_loss_mlp": 1.00300658, + "epoch": 0.9216617567486323, + "flos": 35553107790720.0, + "grad_norm": 2.588808906599, + "language_loss": 0.70170981, + "learning_rate": 6.394657620904143e-08, + "loss": 0.72370577, + "num_input_tokens_seen": 165738960, + "step": 7665, + "time_per_iteration": 2.86556077003479 + }, + { + "auxiliary_loss_clip": 0.01136484, + "auxiliary_loss_mlp": 0.01084643, + "balance_loss_clip": 1.02664113, + "balance_loss_mlp": 1.00448632, + "epoch": 0.9217819996392713, + "flos": 29533330552320.0, + "grad_norm": 1.5344991341576866, + "language_loss": 0.71807694, + "learning_rate": 6.375132025701657e-08, + "loss": 0.74028814, + "num_input_tokens_seen": 165761260, + "step": 7666, + "time_per_iteration": 3.7979226112365723 + }, + { + "auxiliary_loss_clip": 0.01136263, + "auxiliary_loss_mlp": 0.01085301, + "balance_loss_clip": 1.02715874, + "balance_loss_mlp": 1.00504899, + "epoch": 0.9219022425299104, + "flos": 14574669592320.0, + "grad_norm": 2.248993626983932, + "language_loss": 0.69499612, + "learning_rate": 6.355635803237724e-08, + "loss": 0.71721172, + "num_input_tokens_seen": 165776960, + "step": 7667, + "time_per_iteration": 2.61736798286438 + }, + { + "auxiliary_loss_clip": 0.01127371, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_clip": 1.02636588, + "balance_loss_mlp": 1.00408387, + "epoch": 0.9220224854205495, + "flos": 18077503996800.0, + "grad_norm": 1.9496594653534303, + "language_loss": 0.79332376, + "learning_rate": 6.336168956469867e-08, + "loss": 0.81543934, + "num_input_tokens_seen": 165795435, + "step": 7668, + "time_per_iteration": 3.549175262451172 + }, + { + "auxiliary_loss_clip": 0.01117038, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_clip": 1.0257833, + "balance_loss_mlp": 1.00402665, + "epoch": 0.9221427283111886, + "flos": 24790464875520.0, + "grad_norm": 1.749169589444696, + "language_loss": 0.72073221, + "learning_rate": 6.316731488351168e-08, + "loss": 0.74274343, + "num_input_tokens_seen": 165816625, + "step": 7669, + "time_per_iteration": 2.728703498840332 + }, + { + "auxiliary_loss_clip": 0.01124626, + "auxiliary_loss_mlp": 0.01084058, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.0039016, + "epoch": 0.9222629712018277, + "flos": 13845036625920.0, + "grad_norm": 1.6809845315047873, + "language_loss": 0.63224995, + "learning_rate": 6.297323401830334e-08, + "loss": 0.65433681, + "num_input_tokens_seen": 165835410, + "step": 7670, + "time_per_iteration": 3.60649037361145 + }, + { + "auxiliary_loss_clip": 0.01125073, + "auxiliary_loss_mlp": 0.01084146, + "balance_loss_clip": 1.02418303, + "balance_loss_mlp": 1.00398922, + "epoch": 0.9223832140924668, + "flos": 21616177196160.0, + "grad_norm": 2.626890901671402, + "language_loss": 0.69018137, + "learning_rate": 6.277944699851523e-08, + "loss": 0.71227354, + "num_input_tokens_seen": 165854930, + "step": 7671, + "time_per_iteration": 3.6382899284362793 + }, + { + "auxiliary_loss_clip": 0.01135228, + "auxiliary_loss_mlp": 0.01085186, + "balance_loss_clip": 1.02626836, + "balance_loss_mlp": 1.00503016, + "epoch": 0.9225034569831059, + "flos": 21142084561920.0, + "grad_norm": 1.675897904145384, + "language_loss": 0.73442358, + "learning_rate": 6.25859538535447e-08, + "loss": 0.75662768, + "num_input_tokens_seen": 165875725, + "step": 7672, + "time_per_iteration": 2.61702561378479 + }, + { + "auxiliary_loss_clip": 0.01114392, + "auxiliary_loss_mlp": 0.01083335, + "balance_loss_clip": 1.0237987, + "balance_loss_mlp": 1.00327408, + "epoch": 0.9226236998737449, + "flos": 12495046844160.0, + "grad_norm": 2.5066891330840377, + "language_loss": 0.78003347, + "learning_rate": 6.239275461274474e-08, + "loss": 0.80201077, + "num_input_tokens_seen": 165892100, + "step": 7673, + "time_per_iteration": 2.6866166591644287 + }, + { + "auxiliary_loss_clip": 0.01124225, + "auxiliary_loss_mlp": 0.01084763, + "balance_loss_clip": 1.02402532, + "balance_loss_mlp": 1.00465441, + "epoch": 0.9227439427643841, + "flos": 26214071581440.0, + "grad_norm": 1.9801295089737385, + "language_loss": 0.85835534, + "learning_rate": 6.219984930542299e-08, + "loss": 0.88044518, + "num_input_tokens_seen": 165912840, + "step": 7674, + "time_per_iteration": 2.8088877201080322 + }, + { + "auxiliary_loss_clip": 0.01127831, + "auxiliary_loss_mlp": 0.01083699, + "balance_loss_clip": 1.02641392, + "balance_loss_mlp": 1.00363851, + "epoch": 0.9228641856550232, + "flos": 17967581400960.0, + "grad_norm": 2.3837604934103207, + "language_loss": 0.76177037, + "learning_rate": 6.200723796084383e-08, + "loss": 0.78388566, + "num_input_tokens_seen": 165930935, + "step": 7675, + "time_per_iteration": 2.710423469543457 + }, + { + "auxiliary_loss_clip": 0.01088468, + "auxiliary_loss_mlp": 0.01078915, + "balance_loss_clip": 1.01714492, + "balance_loss_mlp": 0.99995059, + "epoch": 0.9229844285456622, + "flos": 70420609710720.0, + "grad_norm": 0.7650167696021243, + "language_loss": 0.63081551, + "learning_rate": 6.181492060822546e-08, + "loss": 0.65248936, + "num_input_tokens_seen": 165991110, + "step": 7676, + "time_per_iteration": 3.28501296043396 + }, + { + "auxiliary_loss_clip": 0.01099907, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_clip": 1.02360332, + "balance_loss_mlp": 1.00421643, + "epoch": 0.9231046714363014, + "flos": 17967832796160.0, + "grad_norm": 1.9965682999479302, + "language_loss": 0.81592667, + "learning_rate": 6.162289727674274e-08, + "loss": 0.83776891, + "num_input_tokens_seen": 166008790, + "step": 7677, + "time_per_iteration": 2.7790002822875977 + }, + { + "auxiliary_loss_clip": 0.01105713, + "auxiliary_loss_mlp": 0.01083808, + "balance_loss_clip": 1.02288294, + "balance_loss_mlp": 1.00384295, + "epoch": 0.9232249143269404, + "flos": 17858233422720.0, + "grad_norm": 2.4749920746684073, + "language_loss": 0.87890291, + "learning_rate": 6.143116799552527e-08, + "loss": 0.90079808, + "num_input_tokens_seen": 166025035, + "step": 7678, + "time_per_iteration": 2.7625181674957275 + }, + { + "auxiliary_loss_clip": 0.01127086, + "auxiliary_loss_mlp": 0.0108388, + "balance_loss_clip": 1.02649033, + "balance_loss_mlp": 1.00377154, + "epoch": 0.9233451572175795, + "flos": 23404384903680.0, + "grad_norm": 1.979788885003063, + "language_loss": 0.55599296, + "learning_rate": 6.123973279365802e-08, + "loss": 0.57810265, + "num_input_tokens_seen": 166044010, + "step": 7679, + "time_per_iteration": 2.7092125415802 + }, + { + "auxiliary_loss_clip": 0.01127622, + "auxiliary_loss_mlp": 0.010846, + "balance_loss_clip": 1.02680302, + "balance_loss_mlp": 1.00453866, + "epoch": 0.9234654001082186, + "flos": 17999326045440.0, + "grad_norm": 1.7898576370176094, + "language_loss": 0.77701962, + "learning_rate": 6.10485917001824e-08, + "loss": 0.79914188, + "num_input_tokens_seen": 166061865, + "step": 7680, + "time_per_iteration": 2.6745259761810303 + }, + { + "auxiliary_loss_clip": 0.01102404, + "auxiliary_loss_mlp": 0.01083406, + "balance_loss_clip": 1.02617168, + "balance_loss_mlp": 1.00339305, + "epoch": 0.9235856429988577, + "flos": 24750747411840.0, + "grad_norm": 1.7046088424780192, + "language_loss": 0.80809653, + "learning_rate": 6.085774474409322e-08, + "loss": 0.82995462, + "num_input_tokens_seen": 166082425, + "step": 7681, + "time_per_iteration": 2.7647392749786377 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01084396, + "balance_loss_clip": 1.02409029, + "balance_loss_mlp": 1.00428772, + "epoch": 0.9237058858894968, + "flos": 14099894599680.0, + "grad_norm": 1.8369901065223846, + "language_loss": 0.70234168, + "learning_rate": 6.066719195434267e-08, + "loss": 0.72432923, + "num_input_tokens_seen": 166100225, + "step": 7682, + "time_per_iteration": 2.6925551891326904 + }, + { + "auxiliary_loss_clip": 0.01125618, + "auxiliary_loss_mlp": 0.01085305, + "balance_loss_clip": 1.02494955, + "balance_loss_mlp": 1.00510097, + "epoch": 0.9238261287801359, + "flos": 28694529175680.0, + "grad_norm": 1.9947850759374661, + "language_loss": 0.66656327, + "learning_rate": 6.047693335983717e-08, + "loss": 0.68867254, + "num_input_tokens_seen": 166122570, + "step": 7683, + "time_per_iteration": 2.7539749145507812 + }, + { + "auxiliary_loss_clip": 0.0112415, + "auxiliary_loss_mlp": 0.01083661, + "balance_loss_clip": 1.02328265, + "balance_loss_mlp": 1.00345707, + "epoch": 0.923946371670775, + "flos": 23111856541440.0, + "grad_norm": 5.049271819402442, + "language_loss": 0.82301867, + "learning_rate": 6.028696898943853e-08, + "loss": 0.84509677, + "num_input_tokens_seen": 166141630, + "step": 7684, + "time_per_iteration": 2.655804395675659 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.00872971, + "balance_loss_clip": 1.0244621, + "balance_loss_mlp": 1.00007939, + "epoch": 0.924066614561414, + "flos": 21867120587520.0, + "grad_norm": 1.9011665913194076, + "language_loss": 0.70540828, + "learning_rate": 6.00972988719648e-08, + "loss": 0.7253145, + "num_input_tokens_seen": 166159865, + "step": 7685, + "time_per_iteration": 2.727846622467041 + }, + { + "auxiliary_loss_clip": 0.01107848, + "auxiliary_loss_mlp": 0.00872974, + "balance_loss_clip": 1.02430391, + "balance_loss_mlp": 1.00009727, + "epoch": 0.9241868574520532, + "flos": 28511887495680.0, + "grad_norm": 6.0051488540931235, + "language_loss": 0.70632678, + "learning_rate": 5.990792303618807e-08, + "loss": 0.72613502, + "num_input_tokens_seen": 166179445, + "step": 7686, + "time_per_iteration": 2.793781042098999 + }, + { + "auxiliary_loss_clip": 0.01100159, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_clip": 1.01909637, + "balance_loss_mlp": 1.00486529, + "epoch": 0.9243071003426923, + "flos": 30518324282880.0, + "grad_norm": 1.6049952973348807, + "language_loss": 0.69555211, + "learning_rate": 5.971884151083695e-08, + "loss": 0.71740246, + "num_input_tokens_seen": 166201855, + "step": 7687, + "time_per_iteration": 2.8850700855255127 + }, + { + "auxiliary_loss_clip": 0.01120274, + "auxiliary_loss_mlp": 0.01082816, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.00270724, + "epoch": 0.9244273432333313, + "flos": 28658331244800.0, + "grad_norm": 1.9172062845589444, + "language_loss": 0.74329603, + "learning_rate": 5.9530054324595124e-08, + "loss": 0.76532698, + "num_input_tokens_seen": 166221970, + "step": 7688, + "time_per_iteration": 2.759965181350708 + }, + { + "auxiliary_loss_clip": 0.01106658, + "auxiliary_loss_mlp": 0.00873014, + "balance_loss_clip": 1.02016282, + "balance_loss_mlp": 1.00133884, + "epoch": 0.9245475861239704, + "flos": 66230589237120.0, + "grad_norm": 0.7165473971330211, + "language_loss": 0.57569897, + "learning_rate": 5.934156150610103e-08, + "loss": 0.5954957, + "num_input_tokens_seen": 166279335, + "step": 7689, + "time_per_iteration": 3.2562143802642822 + }, + { + "auxiliary_loss_clip": 0.01116486, + "auxiliary_loss_mlp": 0.01084374, + "balance_loss_clip": 1.02406073, + "balance_loss_mlp": 1.00421762, + "epoch": 0.9246678290146095, + "flos": 24239918142720.0, + "grad_norm": 6.084629434546869, + "language_loss": 0.79408503, + "learning_rate": 5.915336308394914e-08, + "loss": 0.81609362, + "num_input_tokens_seen": 166298170, + "step": 7690, + "time_per_iteration": 2.7847039699554443 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01083511, + "balance_loss_clip": 1.02423215, + "balance_loss_mlp": 1.00349784, + "epoch": 0.9247880719052486, + "flos": 18988808976000.0, + "grad_norm": 1.5109382653064989, + "language_loss": 0.76851112, + "learning_rate": 5.89654590866886e-08, + "loss": 0.7905829, + "num_input_tokens_seen": 166317670, + "step": 7691, + "time_per_iteration": 2.6792495250701904 + }, + { + "auxiliary_loss_clip": 0.01081732, + "auxiliary_loss_mlp": 0.01084639, + "balance_loss_clip": 1.01859593, + "balance_loss_mlp": 1.00448275, + "epoch": 0.9249083147958876, + "flos": 24024095274240.0, + "grad_norm": 2.208666180533446, + "language_loss": 0.88192964, + "learning_rate": 5.877784954282483e-08, + "loss": 0.90359342, + "num_input_tokens_seen": 166337010, + "step": 7692, + "time_per_iteration": 3.8866071701049805 + }, + { + "auxiliary_loss_clip": 0.01125395, + "auxiliary_loss_mlp": 0.01084368, + "balance_loss_clip": 1.02472043, + "balance_loss_mlp": 1.00425887, + "epoch": 0.9250285576865268, + "flos": 30773972355840.0, + "grad_norm": 1.9859140863721818, + "language_loss": 0.72397399, + "learning_rate": 5.8590534480817963e-08, + "loss": 0.74607158, + "num_input_tokens_seen": 166358735, + "step": 7693, + "time_per_iteration": 3.763605833053589 + }, + { + "auxiliary_loss_clip": 0.01136182, + "auxiliary_loss_mlp": 0.01083709, + "balance_loss_clip": 1.02692878, + "balance_loss_mlp": 1.00364852, + "epoch": 0.9251488005771659, + "flos": 10633581348480.0, + "grad_norm": 2.1157614716096402, + "language_loss": 0.72524953, + "learning_rate": 5.840351392908349e-08, + "loss": 0.74744844, + "num_input_tokens_seen": 166374455, + "step": 7694, + "time_per_iteration": 2.594393730163574 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.00872854, + "balance_loss_clip": 1.02660704, + "balance_loss_mlp": 1.00012445, + "epoch": 0.9252690434678049, + "flos": 23586416052480.0, + "grad_norm": 2.325990608287426, + "language_loss": 0.70416719, + "learning_rate": 5.821678791599205e-08, + "loss": 0.72409594, + "num_input_tokens_seen": 166393900, + "step": 7695, + "time_per_iteration": 3.6392416954040527 + }, + { + "auxiliary_loss_clip": 0.01114621, + "auxiliary_loss_mlp": 0.01083389, + "balance_loss_clip": 1.02362752, + "balance_loss_mlp": 1.00332832, + "epoch": 0.9253892863584441, + "flos": 21469158829440.0, + "grad_norm": 1.7719987130676906, + "language_loss": 0.80864275, + "learning_rate": 5.803035646986965e-08, + "loss": 0.83062285, + "num_input_tokens_seen": 166413235, + "step": 7696, + "time_per_iteration": 3.642897367477417 + }, + { + "auxiliary_loss_clip": 0.0113463, + "auxiliary_loss_mlp": 0.01084218, + "balance_loss_clip": 1.02532744, + "balance_loss_mlp": 1.00406146, + "epoch": 0.9255095292490831, + "flos": 17456680304640.0, + "grad_norm": 2.666192303405382, + "language_loss": 0.67757642, + "learning_rate": 5.7844219618998766e-08, + "loss": 0.69976491, + "num_input_tokens_seen": 166427560, + "step": 7697, + "time_per_iteration": 2.6389739513397217 + }, + { + "auxiliary_loss_clip": 0.01106917, + "auxiliary_loss_mlp": 0.01085691, + "balance_loss_clip": 1.02289128, + "balance_loss_mlp": 1.00558221, + "epoch": 0.9256297721397222, + "flos": 24750675584640.0, + "grad_norm": 1.6993218688036202, + "language_loss": 0.71723008, + "learning_rate": 5.765837739161505e-08, + "loss": 0.73915613, + "num_input_tokens_seen": 166446680, + "step": 7698, + "time_per_iteration": 2.793496608734131 + }, + { + "auxiliary_loss_clip": 0.01105495, + "auxiliary_loss_mlp": 0.01083444, + "balance_loss_clip": 1.02319813, + "balance_loss_mlp": 1.00343132, + "epoch": 0.9257500150303614, + "flos": 23112215677440.0, + "grad_norm": 2.0401259134027376, + "language_loss": 0.74302518, + "learning_rate": 5.7472829815911504e-08, + "loss": 0.76491457, + "num_input_tokens_seen": 166465505, + "step": 7699, + "time_per_iteration": 2.7564282417297363 + }, + { + "auxiliary_loss_clip": 0.01115391, + "auxiliary_loss_mlp": 0.01083814, + "balance_loss_clip": 1.02340579, + "balance_loss_mlp": 1.00370538, + "epoch": 0.9258702579210004, + "flos": 22564685687040.0, + "grad_norm": 2.3257905554483402, + "language_loss": 0.81519258, + "learning_rate": 5.7287576920035164e-08, + "loss": 0.83718467, + "num_input_tokens_seen": 166484520, + "step": 7700, + "time_per_iteration": 2.7290210723876953 + }, + { + "auxiliary_loss_clip": 0.01102722, + "auxiliary_loss_mlp": 0.01085006, + "balance_loss_clip": 1.02244461, + "balance_loss_mlp": 1.00489783, + "epoch": 0.9259905008116395, + "flos": 30004298703360.0, + "grad_norm": 1.822600674218562, + "language_loss": 0.76768005, + "learning_rate": 5.7102618732088435e-08, + "loss": 0.78955734, + "num_input_tokens_seen": 166503850, + "step": 7701, + "time_per_iteration": 2.7909629344940186 + }, + { + "auxiliary_loss_clip": 0.01102212, + "auxiliary_loss_mlp": 0.01084491, + "balance_loss_clip": 1.0258652, + "balance_loss_mlp": 1.00447798, + "epoch": 0.9261107437022786, + "flos": 24572128055040.0, + "grad_norm": 1.638543752156092, + "language_loss": 0.7457695, + "learning_rate": 5.6917955280130216e-08, + "loss": 0.76763654, + "num_input_tokens_seen": 166525330, + "step": 7702, + "time_per_iteration": 2.7576704025268555 + }, + { + "auxiliary_loss_clip": 0.01125854, + "auxiliary_loss_mlp": 0.01084214, + "balance_loss_clip": 1.02586973, + "balance_loss_mlp": 1.00400996, + "epoch": 0.9262309865929177, + "flos": 22018448586240.0, + "grad_norm": 2.32648238545649, + "language_loss": 0.72043931, + "learning_rate": 5.6733586592172755e-08, + "loss": 0.74254, + "num_input_tokens_seen": 166544825, + "step": 7703, + "time_per_iteration": 2.7090394496917725 + }, + { + "auxiliary_loss_clip": 0.01114081, + "auxiliary_loss_mlp": 0.00872778, + "balance_loss_clip": 1.02315736, + "balance_loss_mlp": 1.00010133, + "epoch": 0.9263512294835567, + "flos": 20339481116160.0, + "grad_norm": 1.7548940459771998, + "language_loss": 0.79863423, + "learning_rate": 5.6549512696185244e-08, + "loss": 0.81850284, + "num_input_tokens_seen": 166563325, + "step": 7704, + "time_per_iteration": 2.7406158447265625 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01083996, + "balance_loss_clip": 1.02641857, + "balance_loss_mlp": 1.00398278, + "epoch": 0.9264714723741959, + "flos": 21215378263680.0, + "grad_norm": 1.6242683073926172, + "language_loss": 0.68348509, + "learning_rate": 5.636573362009156e-08, + "loss": 0.70567662, + "num_input_tokens_seen": 166583385, + "step": 7705, + "time_per_iteration": 2.675999402999878 + }, + { + "auxiliary_loss_clip": 0.01135057, + "auxiliary_loss_mlp": 0.01084064, + "balance_loss_clip": 1.02530646, + "balance_loss_mlp": 1.00395548, + "epoch": 0.926591715264835, + "flos": 18004964480640.0, + "grad_norm": 1.9332025025234985, + "language_loss": 0.77015114, + "learning_rate": 5.618224939177074e-08, + "loss": 0.79234231, + "num_input_tokens_seen": 166601290, + "step": 7706, + "time_per_iteration": 2.608963966369629 + }, + { + "auxiliary_loss_clip": 0.01115753, + "auxiliary_loss_mlp": 0.01083479, + "balance_loss_clip": 1.02413857, + "balance_loss_mlp": 1.00341833, + "epoch": 0.926711958155474, + "flos": 36167969825280.0, + "grad_norm": 1.7499063003791022, + "language_loss": 0.70294499, + "learning_rate": 5.599906003905719e-08, + "loss": 0.72493732, + "num_input_tokens_seen": 166623835, + "step": 7707, + "time_per_iteration": 2.8489668369293213 + }, + { + "auxiliary_loss_clip": 0.01119624, + "auxiliary_loss_mlp": 0.01085504, + "balance_loss_clip": 1.02162099, + "balance_loss_mlp": 1.00529981, + "epoch": 0.9268322010461132, + "flos": 21032736583680.0, + "grad_norm": 2.2006655097967873, + "language_loss": 0.81173611, + "learning_rate": 5.581616558974023e-08, + "loss": 0.83378744, + "num_input_tokens_seen": 166642400, + "step": 7708, + "time_per_iteration": 2.679408550262451 + }, + { + "auxiliary_loss_clip": 0.0112903, + "auxiliary_loss_mlp": 0.0087301, + "balance_loss_clip": 1.02739859, + "balance_loss_mlp": 1.00009394, + "epoch": 0.9269524439367522, + "flos": 22964838174720.0, + "grad_norm": 2.31526813946331, + "language_loss": 0.78884137, + "learning_rate": 5.5633566071565444e-08, + "loss": 0.80886173, + "num_input_tokens_seen": 166661640, + "step": 7709, + "time_per_iteration": 2.724083423614502 + }, + { + "auxiliary_loss_clip": 0.01087647, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_clip": 1.02224755, + "balance_loss_mlp": 1.00432622, + "epoch": 0.9270726868273913, + "flos": 41975551468800.0, + "grad_norm": 1.849206928135985, + "language_loss": 0.70736837, + "learning_rate": 5.5451261512232896e-08, + "loss": 0.72908825, + "num_input_tokens_seen": 166684320, + "step": 7710, + "time_per_iteration": 2.9706454277038574 + }, + { + "auxiliary_loss_clip": 0.01126528, + "auxiliary_loss_mlp": 0.01084618, + "balance_loss_clip": 1.02489305, + "balance_loss_mlp": 1.00450945, + "epoch": 0.9271929297180305, + "flos": 19791771557760.0, + "grad_norm": 1.9578311008713687, + "language_loss": 0.62257528, + "learning_rate": 5.5269251939397576e-08, + "loss": 0.64468676, + "num_input_tokens_seen": 166703835, + "step": 7711, + "time_per_iteration": 2.7219796180725098 + }, + { + "auxiliary_loss_clip": 0.01101045, + "auxiliary_loss_mlp": 0.01083203, + "balance_loss_clip": 1.02277935, + "balance_loss_mlp": 1.00309491, + "epoch": 0.9273131726086695, + "flos": 19968343839360.0, + "grad_norm": 2.0492413355539933, + "language_loss": 0.7654649, + "learning_rate": 5.508753738067073e-08, + "loss": 0.78730732, + "num_input_tokens_seen": 166723375, + "step": 7712, + "time_per_iteration": 2.791010856628418 + }, + { + "auxiliary_loss_clip": 0.01125618, + "auxiliary_loss_mlp": 0.01085416, + "balance_loss_clip": 1.02450633, + "balance_loss_mlp": 1.00525999, + "epoch": 0.9274334154993086, + "flos": 23258587599360.0, + "grad_norm": 1.7976853356356175, + "language_loss": 0.7871989, + "learning_rate": 5.4906117863617875e-08, + "loss": 0.80930924, + "num_input_tokens_seen": 166742760, + "step": 7713, + "time_per_iteration": 2.733142375946045 + }, + { + "auxiliary_loss_clip": 0.0110706, + "auxiliary_loss_mlp": 0.01084191, + "balance_loss_clip": 1.02299643, + "balance_loss_mlp": 1.00408196, + "epoch": 0.9275536583899477, + "flos": 31795343585280.0, + "grad_norm": 1.878099803402559, + "language_loss": 0.78401279, + "learning_rate": 5.4724993415760533e-08, + "loss": 0.80592525, + "num_input_tokens_seen": 166761115, + "step": 7714, + "time_per_iteration": 2.794428825378418 + }, + { + "auxiliary_loss_clip": 0.01109596, + "auxiliary_loss_mlp": 0.00872889, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.00008893, + "epoch": 0.9276739012805868, + "flos": 18696998885760.0, + "grad_norm": 2.8376958426656254, + "language_loss": 0.74624026, + "learning_rate": 5.454416406457496e-08, + "loss": 0.76606512, + "num_input_tokens_seen": 166780210, + "step": 7715, + "time_per_iteration": 2.7525389194488525 + }, + { + "auxiliary_loss_clip": 0.01126057, + "auxiliary_loss_mlp": 0.01082892, + "balance_loss_clip": 1.02531648, + "balance_loss_mlp": 1.00292659, + "epoch": 0.9277941441712259, + "flos": 13879079740800.0, + "grad_norm": 3.2904730123846297, + "language_loss": 0.744789, + "learning_rate": 5.436362983749299e-08, + "loss": 0.76687849, + "num_input_tokens_seen": 166795380, + "step": 7716, + "time_per_iteration": 2.7161920070648193 + }, + { + "auxiliary_loss_clip": 0.01104837, + "auxiliary_loss_mlp": 0.01084782, + "balance_loss_clip": 1.02330899, + "balance_loss_mlp": 1.00472081, + "epoch": 0.927914387061865, + "flos": 23258659426560.0, + "grad_norm": 2.064448815131675, + "language_loss": 0.64476115, + "learning_rate": 5.418339076190137e-08, + "loss": 0.66665739, + "num_input_tokens_seen": 166814890, + "step": 7717, + "time_per_iteration": 2.748561143875122 + }, + { + "auxiliary_loss_clip": 0.01116866, + "auxiliary_loss_mlp": 0.01084507, + "balance_loss_clip": 1.02516603, + "balance_loss_mlp": 1.00439847, + "epoch": 0.9280346299525041, + "flos": 18073733068800.0, + "grad_norm": 2.1024308637962794, + "language_loss": 0.88477677, + "learning_rate": 5.400344686514202e-08, + "loss": 0.90679049, + "num_input_tokens_seen": 166832475, + "step": 7718, + "time_per_iteration": 4.424227476119995 + }, + { + "auxiliary_loss_clip": 0.0112411, + "auxiliary_loss_mlp": 0.01084072, + "balance_loss_clip": 1.02482235, + "balance_loss_mlp": 1.00396299, + "epoch": 0.9281548728431431, + "flos": 22342901160960.0, + "grad_norm": 1.962048849315437, + "language_loss": 0.66869128, + "learning_rate": 5.38237981745131e-08, + "loss": 0.69077307, + "num_input_tokens_seen": 166850590, + "step": 7719, + "time_per_iteration": 2.8100695610046387 + }, + { + "auxiliary_loss_clip": 0.01126523, + "auxiliary_loss_mlp": 0.00872805, + "balance_loss_clip": 1.02574229, + "balance_loss_mlp": 1.0001204, + "epoch": 0.9282751157337822, + "flos": 18843765857280.0, + "grad_norm": 1.6449373146490425, + "language_loss": 0.81297714, + "learning_rate": 5.364444471726592e-08, + "loss": 0.8329705, + "num_input_tokens_seen": 166869795, + "step": 7720, + "time_per_iteration": 3.595482110977173 + }, + { + "auxiliary_loss_clip": 0.01123146, + "auxiliary_loss_mlp": 0.01083798, + "balance_loss_clip": 1.02344179, + "balance_loss_mlp": 1.00373733, + "epoch": 0.9283953586244214, + "flos": 25556834476800.0, + "grad_norm": 2.1474804152865796, + "language_loss": 0.79819238, + "learning_rate": 5.346538652060939e-08, + "loss": 0.82026184, + "num_input_tokens_seen": 166891150, + "step": 7721, + "time_per_iteration": 2.7252755165100098 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.0108319, + "balance_loss_clip": 1.02579927, + "balance_loss_mlp": 1.00317645, + "epoch": 0.9285156015150604, + "flos": 18223480869120.0, + "grad_norm": 2.080822907699749, + "language_loss": 0.70069295, + "learning_rate": 5.3286623611705994e-08, + "loss": 0.72269869, + "num_input_tokens_seen": 166909195, + "step": 7722, + "time_per_iteration": 3.6287906169891357 + }, + { + "auxiliary_loss_clip": 0.01112678, + "auxiliary_loss_mlp": 0.01078926, + "balance_loss_clip": 1.01754403, + "balance_loss_mlp": 0.99996167, + "epoch": 0.9286358444056995, + "flos": 66400017690240.0, + "grad_norm": 0.8150680288607861, + "language_loss": 0.60639274, + "learning_rate": 5.3108156017673824e-08, + "loss": 0.62830877, + "num_input_tokens_seen": 166970955, + "step": 7723, + "time_per_iteration": 3.2668135166168213 + }, + { + "auxiliary_loss_clip": 0.01100379, + "auxiliary_loss_mlp": 0.01084196, + "balance_loss_clip": 1.02470875, + "balance_loss_mlp": 1.00403929, + "epoch": 0.9287560872963386, + "flos": 22345630594560.0, + "grad_norm": 1.555658957714496, + "language_loss": 0.71578074, + "learning_rate": 5.2929983765586775e-08, + "loss": 0.73762649, + "num_input_tokens_seen": 166989735, + "step": 7724, + "time_per_iteration": 2.7211053371429443 + }, + { + "auxiliary_loss_clip": 0.01135005, + "auxiliary_loss_mlp": 0.01084524, + "balance_loss_clip": 1.02587342, + "balance_loss_mlp": 1.00451076, + "epoch": 0.9288763301869777, + "flos": 25700225569920.0, + "grad_norm": 1.79687660436008, + "language_loss": 0.62497854, + "learning_rate": 5.275210688247278e-08, + "loss": 0.64717382, + "num_input_tokens_seen": 167010060, + "step": 7725, + "time_per_iteration": 2.683729410171509 + }, + { + "auxiliary_loss_clip": 0.01092383, + "auxiliary_loss_mlp": 0.01085022, + "balance_loss_clip": 1.01961613, + "balance_loss_mlp": 1.00496078, + "epoch": 0.9289965730776167, + "flos": 12312046028160.0, + "grad_norm": 1.931079139933308, + "language_loss": 0.85042834, + "learning_rate": 5.257452539531604e-08, + "loss": 0.8722024, + "num_input_tokens_seen": 167027130, + "step": 7726, + "time_per_iteration": 2.8134586811065674 + }, + { + "auxiliary_loss_clip": 0.01125728, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_clip": 1.02489662, + "balance_loss_mlp": 1.00339437, + "epoch": 0.9291168159682559, + "flos": 26685973486080.0, + "grad_norm": 1.5980331720478946, + "language_loss": 0.6866045, + "learning_rate": 5.2397239331055445e-08, + "loss": 0.70869732, + "num_input_tokens_seen": 167049130, + "step": 7727, + "time_per_iteration": 2.6829044818878174 + }, + { + "auxiliary_loss_clip": 0.01113053, + "auxiliary_loss_mlp": 0.0108484, + "balance_loss_clip": 1.0231452, + "balance_loss_mlp": 1.00473177, + "epoch": 0.929237058858895, + "flos": 14538256179840.0, + "grad_norm": 2.912140165875866, + "language_loss": 0.81215215, + "learning_rate": 5.2220248716585036e-08, + "loss": 0.83413112, + "num_input_tokens_seen": 167066810, + "step": 7728, + "time_per_iteration": 2.7229113578796387 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.02451897, + "balance_loss_mlp": 1.00459993, + "epoch": 0.929357301749534, + "flos": 23835456023040.0, + "grad_norm": 3.3478714547644954, + "language_loss": 0.75531948, + "learning_rate": 5.204355357875445e-08, + "loss": 0.77742422, + "num_input_tokens_seen": 167085155, + "step": 7729, + "time_per_iteration": 2.689549684524536 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.0108469, + "balance_loss_clip": 1.02642846, + "balance_loss_mlp": 1.00458169, + "epoch": 0.9294775446401732, + "flos": 12969319046400.0, + "grad_norm": 1.9814714213223659, + "language_loss": 0.69796169, + "learning_rate": 5.1867153944367584e-08, + "loss": 0.72000611, + "num_input_tokens_seen": 167101545, + "step": 7730, + "time_per_iteration": 2.866814374923706 + }, + { + "auxiliary_loss_clip": 0.01106777, + "auxiliary_loss_mlp": 0.01085355, + "balance_loss_clip": 1.02301598, + "balance_loss_mlp": 1.00524604, + "epoch": 0.9295977875308122, + "flos": 26211809024640.0, + "grad_norm": 1.536958140298251, + "language_loss": 0.73466194, + "learning_rate": 5.16910498401848e-08, + "loss": 0.75658321, + "num_input_tokens_seen": 167120995, + "step": 7731, + "time_per_iteration": 2.8023290634155273 + }, + { + "auxiliary_loss_clip": 0.01135148, + "auxiliary_loss_mlp": 0.01084839, + "balance_loss_clip": 1.02602851, + "balance_loss_mlp": 1.00482523, + "epoch": 0.9297180304214513, + "flos": 16472297105280.0, + "grad_norm": 2.095552998680794, + "language_loss": 0.83700538, + "learning_rate": 5.151524129292073e-08, + "loss": 0.85920531, + "num_input_tokens_seen": 167138890, + "step": 7732, + "time_per_iteration": 2.569974184036255 + }, + { + "auxiliary_loss_clip": 0.01123531, + "auxiliary_loss_mlp": 0.01083417, + "balance_loss_clip": 1.02384233, + "balance_loss_mlp": 1.00326061, + "epoch": 0.9298382733120905, + "flos": 24060436859520.0, + "grad_norm": 1.8791179482506883, + "language_loss": 0.66290849, + "learning_rate": 5.1339728329245155e-08, + "loss": 0.68497801, + "num_input_tokens_seen": 167159455, + "step": 7733, + "time_per_iteration": 2.736466646194458 + }, + { + "auxiliary_loss_clip": 0.01135795, + "auxiliary_loss_mlp": 0.01084709, + "balance_loss_clip": 1.02589595, + "balance_loss_mlp": 1.00455225, + "epoch": 0.9299585162027295, + "flos": 22127652910080.0, + "grad_norm": 1.9395631796336734, + "language_loss": 0.79335356, + "learning_rate": 5.116451097578367e-08, + "loss": 0.81555855, + "num_input_tokens_seen": 167178495, + "step": 7734, + "time_per_iteration": 2.6052000522613525 + }, + { + "auxiliary_loss_clip": 0.011083, + "auxiliary_loss_mlp": 0.01083927, + "balance_loss_clip": 1.02485394, + "balance_loss_mlp": 1.00391364, + "epoch": 0.9300787590933686, + "flos": 21471780522240.0, + "grad_norm": 1.9392501409473992, + "language_loss": 0.74214804, + "learning_rate": 5.0989589259115895e-08, + "loss": 0.76407033, + "num_input_tokens_seen": 167199380, + "step": 7735, + "time_per_iteration": 2.820751667022705 + }, + { + "auxiliary_loss_clip": 0.01125478, + "auxiliary_loss_mlp": 0.01084203, + "balance_loss_clip": 1.02405298, + "balance_loss_mlp": 1.00385594, + "epoch": 0.9301990019840077, + "flos": 17779588594560.0, + "grad_norm": 5.210684716105772, + "language_loss": 0.71336198, + "learning_rate": 5.081496320577816e-08, + "loss": 0.73545885, + "num_input_tokens_seen": 167216500, + "step": 7736, + "time_per_iteration": 2.59816837310791 + }, + { + "auxiliary_loss_clip": 0.01088431, + "auxiliary_loss_mlp": 0.01079455, + "balance_loss_clip": 1.01009154, + "balance_loss_mlp": 1.00049055, + "epoch": 0.9303192448746468, + "flos": 58896122307840.0, + "grad_norm": 0.9163011870130746, + "language_loss": 0.61263883, + "learning_rate": 5.0640632842260835e-08, + "loss": 0.63431764, + "num_input_tokens_seen": 167276760, + "step": 7737, + "time_per_iteration": 3.336621046066284 + }, + { + "auxiliary_loss_clip": 0.0109795, + "auxiliary_loss_mlp": 0.00872841, + "balance_loss_clip": 1.02191138, + "balance_loss_mlp": 1.00012517, + "epoch": 0.9304394877652858, + "flos": 57663522172800.0, + "grad_norm": 1.3998975008123506, + "language_loss": 0.72788119, + "learning_rate": 5.0466598195009426e-08, + "loss": 0.74758911, + "num_input_tokens_seen": 167303630, + "step": 7738, + "time_per_iteration": 3.107853889465332 + }, + { + "auxiliary_loss_clip": 0.01085097, + "auxiliary_loss_mlp": 0.01084153, + "balance_loss_clip": 1.02411711, + "balance_loss_mlp": 1.00409198, + "epoch": 0.930559730655925, + "flos": 20996143603200.0, + "grad_norm": 1.79590677203025, + "language_loss": 0.70394969, + "learning_rate": 5.0292859290425036e-08, + "loss": 0.72564226, + "num_input_tokens_seen": 167321500, + "step": 7739, + "time_per_iteration": 2.6952435970306396 + }, + { + "auxiliary_loss_clip": 0.01135138, + "auxiliary_loss_mlp": 0.01083266, + "balance_loss_clip": 1.02590632, + "balance_loss_mlp": 1.00330043, + "epoch": 0.9306799735465641, + "flos": 23258264376960.0, + "grad_norm": 1.8910166798016277, + "language_loss": 0.77484775, + "learning_rate": 5.011941615486348e-08, + "loss": 0.79703176, + "num_input_tokens_seen": 167340615, + "step": 7740, + "time_per_iteration": 2.680736780166626 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01085307, + "balance_loss_clip": 1.02515173, + "balance_loss_mlp": 1.00519872, + "epoch": 0.9308002164372031, + "flos": 15231547560960.0, + "grad_norm": 1.771585690718401, + "language_loss": 0.845783, + "learning_rate": 4.994626881463659e-08, + "loss": 0.8679809, + "num_input_tokens_seen": 167356870, + "step": 7741, + "time_per_iteration": 2.602540969848633 + }, + { + "auxiliary_loss_clip": 0.01096866, + "auxiliary_loss_mlp": 0.01084038, + "balance_loss_clip": 1.02255583, + "balance_loss_mlp": 1.00388217, + "epoch": 0.9309204593278423, + "flos": 30847481539200.0, + "grad_norm": 6.686296071331167, + "language_loss": 0.70869201, + "learning_rate": 4.9773417296009814e-08, + "loss": 0.73050106, + "num_input_tokens_seen": 167378390, + "step": 7742, + "time_per_iteration": 2.84871244430542 + }, + { + "auxiliary_loss_clip": 0.01127448, + "auxiliary_loss_mlp": 0.01084593, + "balance_loss_clip": 1.02630281, + "balance_loss_mlp": 1.00453246, + "epoch": 0.9310407022184813, + "flos": 23037269950080.0, + "grad_norm": 1.5646210967151817, + "language_loss": 0.65621024, + "learning_rate": 4.960086162520527e-08, + "loss": 0.67833066, + "num_input_tokens_seen": 167398480, + "step": 7743, + "time_per_iteration": 2.6958165168762207 + }, + { + "auxiliary_loss_clip": 0.01081121, + "auxiliary_loss_mlp": 0.01083905, + "balance_loss_clip": 1.02261651, + "balance_loss_mlp": 1.00384378, + "epoch": 0.9311609451091204, + "flos": 22127976132480.0, + "grad_norm": 1.8886550782273863, + "language_loss": 0.8242197, + "learning_rate": 4.942860182839936e-08, + "loss": 0.84587002, + "num_input_tokens_seen": 167416825, + "step": 7744, + "time_per_iteration": 4.600063800811768 + }, + { + "auxiliary_loss_clip": 0.01116885, + "auxiliary_loss_mlp": 0.01085056, + "balance_loss_clip": 1.02452826, + "balance_loss_mlp": 1.00480461, + "epoch": 0.9312811879997596, + "flos": 21099206701440.0, + "grad_norm": 1.8380966395523537, + "language_loss": 0.79370761, + "learning_rate": 4.925663793172341e-08, + "loss": 0.81572706, + "num_input_tokens_seen": 167434785, + "step": 7745, + "time_per_iteration": 3.5779693126678467 + }, + { + "auxiliary_loss_clip": 0.01094546, + "auxiliary_loss_mlp": 0.00872996, + "balance_loss_clip": 1.01643193, + "balance_loss_mlp": 1.00132418, + "epoch": 0.9314014308903986, + "flos": 67148179096320.0, + "grad_norm": 0.7836777571676193, + "language_loss": 0.56521916, + "learning_rate": 4.908496996126477e-08, + "loss": 0.58489454, + "num_input_tokens_seen": 167498245, + "step": 7746, + "time_per_iteration": 3.340641975402832 + }, + { + "auxiliary_loss_clip": 0.01122966, + "auxiliary_loss_mlp": 0.01084069, + "balance_loss_clip": 1.02432013, + "balance_loss_mlp": 1.00391316, + "epoch": 0.9315216737810377, + "flos": 22565583527040.0, + "grad_norm": 1.4632223778162026, + "language_loss": 0.76181531, + "learning_rate": 4.89135979430646e-08, + "loss": 0.7838856, + "num_input_tokens_seen": 167518290, + "step": 7747, + "time_per_iteration": 3.6301989555358887 + }, + { + "auxiliary_loss_clip": 0.01135678, + "auxiliary_loss_mlp": 0.01084029, + "balance_loss_clip": 1.02667427, + "balance_loss_mlp": 1.00387239, + "epoch": 0.9316419166716768, + "flos": 23984054588160.0, + "grad_norm": 1.5749422311759154, + "language_loss": 0.85302913, + "learning_rate": 4.874252190312078e-08, + "loss": 0.87522626, + "num_input_tokens_seen": 167538675, + "step": 7748, + "time_per_iteration": 2.6866962909698486 + }, + { + "auxiliary_loss_clip": 0.01109642, + "auxiliary_loss_mlp": 0.01084536, + "balance_loss_clip": 1.02562404, + "balance_loss_mlp": 1.0043323, + "epoch": 0.9317621595623159, + "flos": 30230464688640.0, + "grad_norm": 1.7062586437047926, + "language_loss": 0.65071273, + "learning_rate": 4.857174186738477e-08, + "loss": 0.67265445, + "num_input_tokens_seen": 167562025, + "step": 7749, + "time_per_iteration": 2.755553722381592 + }, + { + "auxiliary_loss_clip": 0.01135931, + "auxiliary_loss_mlp": 0.01084409, + "balance_loss_clip": 1.02701724, + "balance_loss_mlp": 1.00425291, + "epoch": 0.931882402452955, + "flos": 15742735966080.0, + "grad_norm": 3.7111379779898903, + "language_loss": 0.72883976, + "learning_rate": 4.840125786176408e-08, + "loss": 0.7510432, + "num_input_tokens_seen": 167578230, + "step": 7750, + "time_per_iteration": 2.6173758506774902 + }, + { + "auxiliary_loss_clip": 0.01118577, + "auxiliary_loss_mlp": 0.01084685, + "balance_loss_clip": 1.02633882, + "balance_loss_mlp": 1.00452876, + "epoch": 0.932002645343594, + "flos": 28366521154560.0, + "grad_norm": 2.8168768454210236, + "language_loss": 0.77670348, + "learning_rate": 4.823106991212067e-08, + "loss": 0.79873616, + "num_input_tokens_seen": 167597470, + "step": 7751, + "time_per_iteration": 2.8074228763580322 + }, + { + "auxiliary_loss_clip": 0.0112638, + "auxiliary_loss_mlp": 0.01084169, + "balance_loss_clip": 1.02519917, + "balance_loss_mlp": 1.00410795, + "epoch": 0.9321228882342332, + "flos": 15341146934400.0, + "grad_norm": 1.8291435769696336, + "language_loss": 0.83282948, + "learning_rate": 4.806117804427212e-08, + "loss": 0.85493493, + "num_input_tokens_seen": 167615405, + "step": 7752, + "time_per_iteration": 2.628599166870117 + }, + { + "auxiliary_loss_clip": 0.0112707, + "auxiliary_loss_mlp": 0.01084229, + "balance_loss_clip": 1.02592731, + "balance_loss_mlp": 1.0040729, + "epoch": 0.9322431311248722, + "flos": 17895365107200.0, + "grad_norm": 1.9706119609730464, + "language_loss": 0.64173293, + "learning_rate": 4.7891582283990926e-08, + "loss": 0.66384596, + "num_input_tokens_seen": 167634130, + "step": 7753, + "time_per_iteration": 2.663400411605835 + }, + { + "auxiliary_loss_clip": 0.01091416, + "auxiliary_loss_mlp": 0.01084713, + "balance_loss_clip": 1.02429831, + "balance_loss_mlp": 1.00455666, + "epoch": 0.9323633740155113, + "flos": 24169713010560.0, + "grad_norm": 1.506041796284148, + "language_loss": 0.72661889, + "learning_rate": 4.772228265700473e-08, + "loss": 0.74838018, + "num_input_tokens_seen": 167654990, + "step": 7754, + "time_per_iteration": 2.750067710876465 + }, + { + "auxiliary_loss_clip": 0.01126361, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_clip": 1.02544868, + "balance_loss_mlp": 1.00423169, + "epoch": 0.9324836169061504, + "flos": 15043482927360.0, + "grad_norm": 2.6010550026705497, + "language_loss": 0.75716186, + "learning_rate": 4.75532791889961e-08, + "loss": 0.77926975, + "num_input_tokens_seen": 167671690, + "step": 7755, + "time_per_iteration": 2.6319758892059326 + }, + { + "auxiliary_loss_clip": 0.0112696, + "auxiliary_loss_mlp": 0.01084298, + "balance_loss_clip": 1.02516031, + "balance_loss_mlp": 1.00418937, + "epoch": 0.9326038597967895, + "flos": 18624890332800.0, + "grad_norm": 2.1077389657833363, + "language_loss": 0.65655082, + "learning_rate": 4.738457190560252e-08, + "loss": 0.67866343, + "num_input_tokens_seen": 167690800, + "step": 7756, + "time_per_iteration": 2.643235206604004 + }, + { + "auxiliary_loss_clip": 0.01095636, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_clip": 1.02298045, + "balance_loss_mlp": 1.00421619, + "epoch": 0.9327241026874286, + "flos": 18952646958720.0, + "grad_norm": 1.96745375414635, + "language_loss": 0.78691673, + "learning_rate": 4.721616083241664e-08, + "loss": 0.80871636, + "num_input_tokens_seen": 167709055, + "step": 7757, + "time_per_iteration": 2.839264154434204 + }, + { + "auxiliary_loss_clip": 0.01126127, + "auxiliary_loss_mlp": 0.01084412, + "balance_loss_clip": 1.02504945, + "balance_loss_mlp": 1.00430346, + "epoch": 0.9328443455780677, + "flos": 29570282668800.0, + "grad_norm": 1.718941803947228, + "language_loss": 0.77589607, + "learning_rate": 4.7048045994986684e-08, + "loss": 0.79800147, + "num_input_tokens_seen": 167729915, + "step": 7758, + "time_per_iteration": 2.74666166305542 + }, + { + "auxiliary_loss_clip": 0.01109434, + "auxiliary_loss_mlp": 0.01083594, + "balance_loss_clip": 1.02514958, + "balance_loss_mlp": 1.00343776, + "epoch": 0.9329645884687068, + "flos": 30081722469120.0, + "grad_norm": 1.9115213599813103, + "language_loss": 0.90947282, + "learning_rate": 4.688022741881559e-08, + "loss": 0.9314031, + "num_input_tokens_seen": 167750440, + "step": 7759, + "time_per_iteration": 2.7575247287750244 + }, + { + "auxiliary_loss_clip": 0.01123713, + "auxiliary_loss_mlp": 0.01083062, + "balance_loss_clip": 1.0239799, + "balance_loss_mlp": 1.00309658, + "epoch": 0.9330848313593458, + "flos": 21867982513920.0, + "grad_norm": 1.5399883221083541, + "language_loss": 0.75373554, + "learning_rate": 4.671270512936076e-08, + "loss": 0.77580327, + "num_input_tokens_seen": 167769600, + "step": 7760, + "time_per_iteration": 2.6828818321228027 + }, + { + "auxiliary_loss_clip": 0.01107216, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_clip": 1.02336264, + "balance_loss_mlp": 1.00263977, + "epoch": 0.933205074249985, + "flos": 22127221946880.0, + "grad_norm": 1.6933982055519872, + "language_loss": 0.82617307, + "learning_rate": 4.6545479152035884e-08, + "loss": 0.84807175, + "num_input_tokens_seen": 167788770, + "step": 7761, + "time_per_iteration": 2.8006346225738525 + }, + { + "auxiliary_loss_clip": 0.0112591, + "auxiliary_loss_mlp": 0.01083569, + "balance_loss_clip": 1.0251441, + "balance_loss_mlp": 1.00346041, + "epoch": 0.9333253171406241, + "flos": 15341254675200.0, + "grad_norm": 2.4216353212876096, + "language_loss": 0.76188672, + "learning_rate": 4.637854951220821e-08, + "loss": 0.78398156, + "num_input_tokens_seen": 167805555, + "step": 7762, + "time_per_iteration": 2.723196268081665 + }, + { + "auxiliary_loss_clip": 0.0110649, + "auxiliary_loss_mlp": 0.0108319, + "balance_loss_clip": 1.02314401, + "balance_loss_mlp": 1.00308144, + "epoch": 0.9334455600312631, + "flos": 15706142985600.0, + "grad_norm": 1.9145300570136385, + "language_loss": 0.74676961, + "learning_rate": 4.621191623520171e-08, + "loss": 0.76866645, + "num_input_tokens_seen": 167823985, + "step": 7763, + "time_per_iteration": 2.758314847946167 + }, + { + "auxiliary_loss_clip": 0.0107404, + "auxiliary_loss_mlp": 0.01084103, + "balance_loss_clip": 1.02331591, + "balance_loss_mlp": 1.00399435, + "epoch": 0.9335658029219023, + "flos": 22163563532160.0, + "grad_norm": 2.12161698563531, + "language_loss": 0.84504735, + "learning_rate": 4.604557934629372e-08, + "loss": 0.86662877, + "num_input_tokens_seen": 167843060, + "step": 7764, + "time_per_iteration": 2.8918144702911377 + }, + { + "auxiliary_loss_clip": 0.01113283, + "auxiliary_loss_mlp": 0.0108529, + "balance_loss_clip": 1.02316463, + "balance_loss_mlp": 1.005229, + "epoch": 0.9336860458125413, + "flos": 20266833859200.0, + "grad_norm": 1.5970119833847485, + "language_loss": 0.80436623, + "learning_rate": 4.587953887071805e-08, + "loss": 0.82635194, + "num_input_tokens_seen": 167862880, + "step": 7765, + "time_per_iteration": 2.7185611724853516 + }, + { + "auxiliary_loss_clip": 0.01114889, + "auxiliary_loss_mlp": 0.01084204, + "balance_loss_clip": 1.02328956, + "balance_loss_mlp": 1.00409532, + "epoch": 0.9338062887031804, + "flos": 20919689504640.0, + "grad_norm": 1.7012777341564633, + "language_loss": 0.85925841, + "learning_rate": 4.5713794833662554e-08, + "loss": 0.88124931, + "num_input_tokens_seen": 167882095, + "step": 7766, + "time_per_iteration": 2.7902987003326416 + }, + { + "auxiliary_loss_clip": 0.01134945, + "auxiliary_loss_mlp": 0.01083829, + "balance_loss_clip": 1.02596998, + "balance_loss_mlp": 1.00367236, + "epoch": 0.9339265315938196, + "flos": 23221635482880.0, + "grad_norm": 1.6549687158364106, + "language_loss": 0.63128024, + "learning_rate": 4.5548347260270236e-08, + "loss": 0.65346795, + "num_input_tokens_seen": 167901385, + "step": 7767, + "time_per_iteration": 2.6291451454162598 + }, + { + "auxiliary_loss_clip": 0.01105443, + "auxiliary_loss_mlp": 0.01084136, + "balance_loss_clip": 1.02275968, + "balance_loss_mlp": 1.00402713, + "epoch": 0.9340467744844586, + "flos": 22820261932800.0, + "grad_norm": 1.8830197162032178, + "language_loss": 0.69423401, + "learning_rate": 4.538319617564012e-08, + "loss": 0.71612978, + "num_input_tokens_seen": 167920405, + "step": 7768, + "time_per_iteration": 2.7903339862823486 + }, + { + "auxiliary_loss_clip": 0.01116249, + "auxiliary_loss_mlp": 0.01083382, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.00327349, + "epoch": 0.9341670173750977, + "flos": 23660428026240.0, + "grad_norm": 2.2153587038702227, + "language_loss": 0.74651301, + "learning_rate": 4.521834160482485e-08, + "loss": 0.76850927, + "num_input_tokens_seen": 167939145, + "step": 7769, + "time_per_iteration": 3.62849760055542 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.02405643, + "balance_loss_mlp": 1.00376391, + "epoch": 0.9342872602657368, + "flos": 24824256595200.0, + "grad_norm": 1.541845926297955, + "language_loss": 0.82399333, + "learning_rate": 4.5053783572832846e-08, + "loss": 0.8460815, + "num_input_tokens_seen": 167959325, + "step": 7770, + "time_per_iteration": 4.484643936157227 + }, + { + "auxiliary_loss_clip": 0.01124793, + "auxiliary_loss_mlp": 0.01083706, + "balance_loss_clip": 1.02487302, + "balance_loss_mlp": 1.00364494, + "epoch": 0.9344075031563759, + "flos": 25771831332480.0, + "grad_norm": 1.8711221245302134, + "language_loss": 0.76098382, + "learning_rate": 4.488952210462771e-08, + "loss": 0.78306878, + "num_input_tokens_seen": 167979530, + "step": 7771, + "time_per_iteration": 2.679412841796875 + }, + { + "auxiliary_loss_clip": 0.01135568, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_clip": 1.02669132, + "balance_loss_mlp": 1.00426507, + "epoch": 0.9345277460470149, + "flos": 25551303782400.0, + "grad_norm": 1.7961171148111794, + "language_loss": 0.85726225, + "learning_rate": 4.4725557225127495e-08, + "loss": 0.87946117, + "num_input_tokens_seen": 167997870, + "step": 7772, + "time_per_iteration": 3.661393165588379 + }, + { + "auxiliary_loss_clip": 0.01125897, + "auxiliary_loss_mlp": 0.0108399, + "balance_loss_clip": 1.02514994, + "balance_loss_mlp": 1.0039773, + "epoch": 0.9346479889376541, + "flos": 34313112432000.0, + "grad_norm": 1.7646545869617039, + "language_loss": 0.79346496, + "learning_rate": 4.456188895920565e-08, + "loss": 0.81556392, + "num_input_tokens_seen": 168019625, + "step": 7773, + "time_per_iteration": 2.7833502292633057 + }, + { + "auxiliary_loss_clip": 0.01134355, + "auxiliary_loss_mlp": 0.01083692, + "balance_loss_clip": 1.02544165, + "balance_loss_mlp": 1.00353599, + "epoch": 0.9347682318282932, + "flos": 19093739581440.0, + "grad_norm": 1.9574575183791927, + "language_loss": 0.85327148, + "learning_rate": 4.439851733169031e-08, + "loss": 0.87545192, + "num_input_tokens_seen": 168037415, + "step": 7774, + "time_per_iteration": 2.691349506378174 + }, + { + "auxiliary_loss_clip": 0.01105984, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_clip": 1.0236702, + "balance_loss_mlp": 1.00516796, + "epoch": 0.9348884747189322, + "flos": 26249587153920.0, + "grad_norm": 2.4816056384086145, + "language_loss": 0.69484401, + "learning_rate": 4.4235442367365204e-08, + "loss": 0.71675706, + "num_input_tokens_seen": 168057725, + "step": 7775, + "time_per_iteration": 2.807372808456421 + }, + { + "auxiliary_loss_clip": 0.01118177, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_clip": 1.02421153, + "balance_loss_mlp": 1.00390482, + "epoch": 0.9350087176095714, + "flos": 18333080242560.0, + "grad_norm": 1.9475667848571885, + "language_loss": 0.79625511, + "learning_rate": 4.4072664090968545e-08, + "loss": 0.8182776, + "num_input_tokens_seen": 168076110, + "step": 7776, + "time_per_iteration": 2.6640608310699463 + }, + { + "auxiliary_loss_clip": 0.01117375, + "auxiliary_loss_mlp": 0.01083671, + "balance_loss_clip": 1.02385974, + "balance_loss_mlp": 1.00356245, + "epoch": 0.9351289605002104, + "flos": 19318253541120.0, + "grad_norm": 1.797445226442619, + "language_loss": 0.84708214, + "learning_rate": 4.391018252719347e-08, + "loss": 0.86909252, + "num_input_tokens_seen": 168095905, + "step": 7777, + "time_per_iteration": 2.695430278778076 + }, + { + "auxiliary_loss_clip": 0.01118306, + "auxiliary_loss_mlp": 0.01083805, + "balance_loss_clip": 1.02491832, + "balance_loss_mlp": 1.00369596, + "epoch": 0.9352492033908495, + "flos": 18799990156800.0, + "grad_norm": 2.2615636785836633, + "language_loss": 0.69107687, + "learning_rate": 4.374799770068849e-08, + "loss": 0.71309793, + "num_input_tokens_seen": 168112580, + "step": 7778, + "time_per_iteration": 2.7485294342041016 + }, + { + "auxiliary_loss_clip": 0.0112439, + "auxiliary_loss_mlp": 0.01083336, + "balance_loss_clip": 1.02534354, + "balance_loss_mlp": 1.003227, + "epoch": 0.9353694462814887, + "flos": 29530134241920.0, + "grad_norm": 1.9564892591565133, + "language_loss": 0.74461055, + "learning_rate": 4.358610963605658e-08, + "loss": 0.76668787, + "num_input_tokens_seen": 168133030, + "step": 7779, + "time_per_iteration": 2.709512710571289 + }, + { + "auxiliary_loss_clip": 0.01135818, + "auxiliary_loss_mlp": 0.01084038, + "balance_loss_clip": 1.02638173, + "balance_loss_mlp": 1.00388217, + "epoch": 0.9354896891721277, + "flos": 30665450390400.0, + "grad_norm": 1.9774311138798093, + "language_loss": 0.68600738, + "learning_rate": 4.342451835785677e-08, + "loss": 0.70820594, + "num_input_tokens_seen": 168153940, + "step": 7780, + "time_per_iteration": 2.7121903896331787 + }, + { + "auxiliary_loss_clip": 0.01117443, + "auxiliary_loss_mlp": 0.0108318, + "balance_loss_clip": 1.0252254, + "balance_loss_mlp": 1.00316715, + "epoch": 0.9356099320627668, + "flos": 19463907191040.0, + "grad_norm": 1.6150011472769985, + "language_loss": 0.74980086, + "learning_rate": 4.3263223890601665e-08, + "loss": 0.77180707, + "num_input_tokens_seen": 168172650, + "step": 7781, + "time_per_iteration": 2.7440104484558105 + }, + { + "auxiliary_loss_clip": 0.01119398, + "auxiliary_loss_mlp": 0.00872886, + "balance_loss_clip": 1.02556014, + "balance_loss_mlp": 1.00011551, + "epoch": 0.9357301749534058, + "flos": 19098156954240.0, + "grad_norm": 1.8188747640328577, + "language_loss": 0.79566938, + "learning_rate": 4.31022262587597e-08, + "loss": 0.81559229, + "num_input_tokens_seen": 168191325, + "step": 7782, + "time_per_iteration": 2.741809844970703 + }, + { + "auxiliary_loss_clip": 0.01125517, + "auxiliary_loss_mlp": 0.01084764, + "balance_loss_clip": 1.02542722, + "balance_loss_mlp": 1.00465584, + "epoch": 0.935850417844045, + "flos": 23550361776000.0, + "grad_norm": 1.7061010719756597, + "language_loss": 0.66089207, + "learning_rate": 4.2941525486754225e-08, + "loss": 0.68299484, + "num_input_tokens_seen": 168211645, + "step": 7783, + "time_per_iteration": 2.735957384109497 + }, + { + "auxiliary_loss_clip": 0.01098155, + "auxiliary_loss_mlp": 0.01083364, + "balance_loss_clip": 1.02270222, + "balance_loss_mlp": 1.00335097, + "epoch": 0.935970660734684, + "flos": 18588333265920.0, + "grad_norm": 1.8828644544752537, + "language_loss": 0.79354894, + "learning_rate": 4.278112159896286e-08, + "loss": 0.81536412, + "num_input_tokens_seen": 168229485, + "step": 7784, + "time_per_iteration": 2.7745566368103027 + }, + { + "auxiliary_loss_clip": 0.01118198, + "auxiliary_loss_mlp": 0.01083928, + "balance_loss_clip": 1.02518713, + "balance_loss_mlp": 1.00391507, + "epoch": 0.9360909036253231, + "flos": 20631255292800.0, + "grad_norm": 1.843625460838931, + "language_loss": 0.67546874, + "learning_rate": 4.2621014619719896e-08, + "loss": 0.69748998, + "num_input_tokens_seen": 168247250, + "step": 7785, + "time_per_iteration": 2.7449634075164795 + }, + { + "auxiliary_loss_clip": 0.01096388, + "auxiliary_loss_mlp": 0.01078916, + "balance_loss_clip": 1.01743627, + "balance_loss_mlp": 0.99995196, + "epoch": 0.9362111465159623, + "flos": 61791421052160.0, + "grad_norm": 0.7169633793825854, + "language_loss": 0.58613402, + "learning_rate": 4.246120457331215e-08, + "loss": 0.60788703, + "num_input_tokens_seen": 168309425, + "step": 7786, + "time_per_iteration": 3.3875365257263184 + }, + { + "auxiliary_loss_clip": 0.01118176, + "auxiliary_loss_mlp": 0.01085075, + "balance_loss_clip": 1.02600884, + "balance_loss_mlp": 1.00496674, + "epoch": 0.9363313894066013, + "flos": 24170395368960.0, + "grad_norm": 1.8927157202877098, + "language_loss": 0.71853805, + "learning_rate": 4.2301691483983325e-08, + "loss": 0.7405706, + "num_input_tokens_seen": 168329545, + "step": 7787, + "time_per_iteration": 2.7662720680236816 + }, + { + "auxiliary_loss_clip": 0.01126988, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_clip": 1.02580237, + "balance_loss_mlp": 1.00517499, + "epoch": 0.9364516322972404, + "flos": 20120354196480.0, + "grad_norm": 1.5683543906320827, + "language_loss": 0.75704974, + "learning_rate": 4.214247537593163e-08, + "loss": 0.77917242, + "num_input_tokens_seen": 168348795, + "step": 7788, + "time_per_iteration": 2.717256784439087 + }, + { + "auxiliary_loss_clip": 0.01118998, + "auxiliary_loss_mlp": 0.01084689, + "balance_loss_clip": 1.02562439, + "balance_loss_mlp": 1.00462782, + "epoch": 0.9365718751878795, + "flos": 20703758895360.0, + "grad_norm": 1.741089340441617, + "language_loss": 0.80533683, + "learning_rate": 4.1983556273309293e-08, + "loss": 0.82737374, + "num_input_tokens_seen": 168367545, + "step": 7789, + "time_per_iteration": 2.73551607131958 + }, + { + "auxiliary_loss_clip": 0.01135226, + "auxiliary_loss_mlp": 0.01084122, + "balance_loss_clip": 1.02578425, + "balance_loss_mlp": 1.00396514, + "epoch": 0.9366921180785186, + "flos": 18655270260480.0, + "grad_norm": 2.5123477888404193, + "language_loss": 0.68826425, + "learning_rate": 4.182493420022526e-08, + "loss": 0.71045768, + "num_input_tokens_seen": 168383215, + "step": 7790, + "time_per_iteration": 2.6376099586486816 + }, + { + "auxiliary_loss_clip": 0.01086704, + "auxiliary_loss_mlp": 0.01083226, + "balance_loss_clip": 1.02172685, + "balance_loss_mlp": 1.00326061, + "epoch": 0.9368123609691577, + "flos": 25774955815680.0, + "grad_norm": 1.5795561417443038, + "language_loss": 0.78494012, + "learning_rate": 4.166660918074139e-08, + "loss": 0.80663943, + "num_input_tokens_seen": 168403120, + "step": 7791, + "time_per_iteration": 2.7920162677764893 + }, + { + "auxiliary_loss_clip": 0.01106921, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_clip": 1.02347231, + "balance_loss_mlp": 1.00363612, + "epoch": 0.9369326038597968, + "flos": 25553386771200.0, + "grad_norm": 1.4085352093021044, + "language_loss": 0.73485231, + "learning_rate": 4.15085812388758e-08, + "loss": 0.75675899, + "num_input_tokens_seen": 168425340, + "step": 7792, + "time_per_iteration": 2.8022305965423584 + }, + { + "auxiliary_loss_clip": 0.01116342, + "auxiliary_loss_mlp": 0.01084107, + "balance_loss_clip": 1.02496815, + "balance_loss_mlp": 1.00404584, + "epoch": 0.9370528467504359, + "flos": 23220019370880.0, + "grad_norm": 1.5865129654601553, + "language_loss": 0.78523165, + "learning_rate": 4.135085039860153e-08, + "loss": 0.80723614, + "num_input_tokens_seen": 168444740, + "step": 7793, + "time_per_iteration": 2.768970012664795 + }, + { + "auxiliary_loss_clip": 0.01114423, + "auxiliary_loss_mlp": 0.0108376, + "balance_loss_clip": 1.0238564, + "balance_loss_mlp": 1.00355577, + "epoch": 0.9371730896410749, + "flos": 24967468120320.0, + "grad_norm": 2.1383354028320034, + "language_loss": 0.78524721, + "learning_rate": 4.1193416683845906e-08, + "loss": 0.80722898, + "num_input_tokens_seen": 168463670, + "step": 7794, + "time_per_iteration": 2.800165891647339 + }, + { + "auxiliary_loss_clip": 0.01107352, + "auxiliary_loss_mlp": 0.01083638, + "balance_loss_clip": 1.0246259, + "balance_loss_mlp": 1.0036248, + "epoch": 0.9372933325317141, + "flos": 15553091134080.0, + "grad_norm": 2.3030438876721706, + "language_loss": 0.83653569, + "learning_rate": 4.103628011849136e-08, + "loss": 0.85844558, + "num_input_tokens_seen": 168479030, + "step": 7795, + "time_per_iteration": 3.7483580112457275 + }, + { + "auxiliary_loss_clip": 0.01116126, + "auxiliary_loss_mlp": 0.01083331, + "balance_loss_clip": 1.0238322, + "balance_loss_mlp": 1.0031271, + "epoch": 0.9374135754223532, + "flos": 21871861182720.0, + "grad_norm": 1.7261114612693202, + "language_loss": 0.75842565, + "learning_rate": 4.0879440726375506e-08, + "loss": 0.78042018, + "num_input_tokens_seen": 168496815, + "step": 7796, + "time_per_iteration": 3.63917875289917 + }, + { + "auxiliary_loss_clip": 0.01118447, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_clip": 1.02515805, + "balance_loss_mlp": 1.00372744, + "epoch": 0.9375338183129922, + "flos": 22631048064000.0, + "grad_norm": 2.2912608591403503, + "language_loss": 0.55782199, + "learning_rate": 4.0722898531291074e-08, + "loss": 0.57984483, + "num_input_tokens_seen": 168514055, + "step": 7797, + "time_per_iteration": 2.749959707260132 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.01084032, + "balance_loss_clip": 1.02313483, + "balance_loss_mlp": 1.00387621, + "epoch": 0.9376540612036314, + "flos": 26104292640000.0, + "grad_norm": 5.0816134767074805, + "language_loss": 0.76439005, + "learning_rate": 4.0566653556985295e-08, + "loss": 0.78638834, + "num_input_tokens_seen": 168534600, + "step": 7798, + "time_per_iteration": 3.6473278999328613 + }, + { + "auxiliary_loss_clip": 0.01077767, + "auxiliary_loss_mlp": 0.01084741, + "balance_loss_clip": 1.0211308, + "balance_loss_mlp": 1.00458455, + "epoch": 0.9377743040942704, + "flos": 19717580016000.0, + "grad_norm": 2.4518597226890373, + "language_loss": 0.82068825, + "learning_rate": 4.0410705827159886e-08, + "loss": 0.84231329, + "num_input_tokens_seen": 168551895, + "step": 7799, + "time_per_iteration": 2.8738529682159424 + }, + { + "auxiliary_loss_clip": 0.01116606, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_clip": 1.02308202, + "balance_loss_mlp": 1.00516748, + "epoch": 0.9378945469849095, + "flos": 15267530010240.0, + "grad_norm": 2.306969538359805, + "language_loss": 0.70836675, + "learning_rate": 4.0255055365472356e-08, + "loss": 0.73038602, + "num_input_tokens_seen": 168569990, + "step": 7800, + "time_per_iteration": 2.757269859313965 + }, + { + "auxiliary_loss_clip": 0.01090343, + "auxiliary_loss_mlp": 0.01085374, + "balance_loss_clip": 1.02137351, + "balance_loss_mlp": 1.00521755, + "epoch": 0.9380147898755486, + "flos": 20591394174720.0, + "grad_norm": 2.1427892852911916, + "language_loss": 0.74788064, + "learning_rate": 4.009970219553471e-08, + "loss": 0.76963782, + "num_input_tokens_seen": 168586940, + "step": 7801, + "time_per_iteration": 2.9307141304016113 + }, + { + "auxiliary_loss_clip": 0.0112416, + "auxiliary_loss_mlp": 0.01084068, + "balance_loss_clip": 1.02384424, + "balance_loss_mlp": 1.00381601, + "epoch": 0.9381350327661877, + "flos": 26281116316800.0, + "grad_norm": 2.222594017430718, + "language_loss": 0.7693277, + "learning_rate": 3.99446463409141e-08, + "loss": 0.79141003, + "num_input_tokens_seen": 168604795, + "step": 7802, + "time_per_iteration": 2.7597427368164062 + }, + { + "auxiliary_loss_clip": 0.01126584, + "auxiliary_loss_mlp": 0.01083696, + "balance_loss_clip": 1.0247407, + "balance_loss_mlp": 1.00353932, + "epoch": 0.9382552756568268, + "flos": 23586344225280.0, + "grad_norm": 1.9548036555971047, + "language_loss": 0.69161618, + "learning_rate": 3.978988782513215e-08, + "loss": 0.71371895, + "num_input_tokens_seen": 168622290, + "step": 7803, + "time_per_iteration": 2.693716287612915 + }, + { + "auxiliary_loss_clip": 0.01125444, + "auxiliary_loss_mlp": 0.0108491, + "balance_loss_clip": 1.02454579, + "balance_loss_mlp": 1.00475371, + "epoch": 0.9383755185474659, + "flos": 28438809275520.0, + "grad_norm": 2.2467071337728743, + "language_loss": 0.76058483, + "learning_rate": 3.963542667166586e-08, + "loss": 0.78268838, + "num_input_tokens_seen": 168642395, + "step": 7804, + "time_per_iteration": 2.783134937286377 + }, + { + "auxiliary_loss_clip": 0.01092275, + "auxiliary_loss_mlp": 0.01084681, + "balance_loss_clip": 1.02580547, + "balance_loss_mlp": 1.00462008, + "epoch": 0.938495761438105, + "flos": 20449583280000.0, + "grad_norm": 1.6537997142403504, + "language_loss": 0.68238831, + "learning_rate": 3.9481262903946486e-08, + "loss": 0.70415783, + "num_input_tokens_seen": 168661840, + "step": 7805, + "time_per_iteration": 2.840817928314209 + }, + { + "auxiliary_loss_clip": 0.01080777, + "auxiliary_loss_mlp": 0.01078933, + "balance_loss_clip": 1.01770234, + "balance_loss_mlp": 0.99996853, + "epoch": 0.938616004328744, + "flos": 69302711658240.0, + "grad_norm": 0.7717702166461493, + "language_loss": 0.54540193, + "learning_rate": 3.932739654536066e-08, + "loss": 0.56699908, + "num_input_tokens_seen": 168724540, + "step": 7806, + "time_per_iteration": 3.359743356704712 + }, + { + "auxiliary_loss_clip": 0.01124509, + "auxiliary_loss_mlp": 0.01084352, + "balance_loss_clip": 1.02480876, + "balance_loss_mlp": 1.00433826, + "epoch": 0.9387362472193832, + "flos": 18911636605440.0, + "grad_norm": 3.1424944394878636, + "language_loss": 0.73943102, + "learning_rate": 3.917382761925014e-08, + "loss": 0.76151967, + "num_input_tokens_seen": 168740375, + "step": 7807, + "time_per_iteration": 2.7183001041412354 + }, + { + "auxiliary_loss_clip": 0.01124159, + "auxiliary_loss_mlp": 0.01084329, + "balance_loss_clip": 1.02504349, + "balance_loss_mlp": 1.00431609, + "epoch": 0.9388564901100223, + "flos": 26501967089280.0, + "grad_norm": 1.6851158895796368, + "language_loss": 0.79160404, + "learning_rate": 3.9020556148910754e-08, + "loss": 0.81368887, + "num_input_tokens_seen": 168759730, + "step": 7808, + "time_per_iteration": 2.673245906829834 + }, + { + "auxiliary_loss_clip": 0.01096992, + "auxiliary_loss_mlp": 0.0107904, + "balance_loss_clip": 1.01777673, + "balance_loss_mlp": 1.00007617, + "epoch": 0.9389767330006613, + "flos": 58941083157120.0, + "grad_norm": 0.7055320738353356, + "language_loss": 0.56750214, + "learning_rate": 3.8867582157593895e-08, + "loss": 0.58926249, + "num_input_tokens_seen": 168813935, + "step": 7809, + "time_per_iteration": 3.1809308528900146 + }, + { + "auxiliary_loss_clip": 0.01124207, + "auxiliary_loss_mlp": 0.01083757, + "balance_loss_clip": 1.02511322, + "balance_loss_mlp": 1.0036962, + "epoch": 0.9390969758913005, + "flos": 31102554994560.0, + "grad_norm": 1.6938920805152655, + "language_loss": 0.76420808, + "learning_rate": 3.871490566850544e-08, + "loss": 0.78628767, + "num_input_tokens_seen": 168838145, + "step": 7810, + "time_per_iteration": 2.755093574523926 + }, + { + "auxiliary_loss_clip": 0.01116251, + "auxiliary_loss_mlp": 0.01083927, + "balance_loss_clip": 1.02396226, + "balance_loss_mlp": 1.00377047, + "epoch": 0.9392172187819395, + "flos": 22419391173120.0, + "grad_norm": 1.7541453769789506, + "language_loss": 0.70468795, + "learning_rate": 3.856252670480642e-08, + "loss": 0.7266897, + "num_input_tokens_seen": 168856805, + "step": 7811, + "time_per_iteration": 2.7261204719543457 + }, + { + "auxiliary_loss_clip": 0.01118218, + "auxiliary_loss_mlp": 0.01084738, + "balance_loss_clip": 1.02482438, + "balance_loss_mlp": 1.00462973, + "epoch": 0.9393374616725786, + "flos": 19719483436800.0, + "grad_norm": 1.6752163075492728, + "language_loss": 0.81174695, + "learning_rate": 3.841044528961279e-08, + "loss": 0.83377653, + "num_input_tokens_seen": 168874600, + "step": 7812, + "time_per_iteration": 2.706622362136841 + }, + { + "auxiliary_loss_clip": 0.01135301, + "auxiliary_loss_mlp": 0.0108296, + "balance_loss_clip": 1.025635, + "balance_loss_mlp": 1.00285125, + "epoch": 0.9394577045632178, + "flos": 24170215800960.0, + "grad_norm": 1.8409498069045016, + "language_loss": 0.78554988, + "learning_rate": 3.825866144599477e-08, + "loss": 0.80773246, + "num_input_tokens_seen": 168893655, + "step": 7813, + "time_per_iteration": 2.6185286045074463 + }, + { + "auxiliary_loss_clip": 0.01114873, + "auxiliary_loss_mlp": 0.01084256, + "balance_loss_clip": 1.023242, + "balance_loss_mlp": 1.00409913, + "epoch": 0.9395779474538568, + "flos": 19023929498880.0, + "grad_norm": 1.8402122540717167, + "language_loss": 0.75202358, + "learning_rate": 3.8107175196978145e-08, + "loss": 0.77401483, + "num_input_tokens_seen": 168909960, + "step": 7814, + "time_per_iteration": 2.8234407901763916 + }, + { + "auxiliary_loss_clip": 0.01104461, + "auxiliary_loss_mlp": 0.01085358, + "balance_loss_clip": 1.02215195, + "balance_loss_mlp": 1.00529695, + "epoch": 0.9396981903444959, + "flos": 14319129260160.0, + "grad_norm": 1.7486359691593016, + "language_loss": 0.76811099, + "learning_rate": 3.7955986565542996e-08, + "loss": 0.79000914, + "num_input_tokens_seen": 168928040, + "step": 7815, + "time_per_iteration": 2.74338698387146 + }, + { + "auxiliary_loss_clip": 0.01106075, + "auxiliary_loss_mlp": 0.0108338, + "balance_loss_clip": 1.02290511, + "balance_loss_mlp": 1.0033195, + "epoch": 0.9398184332351349, + "flos": 34787564202240.0, + "grad_norm": 1.9120115576214156, + "language_loss": 0.67989707, + "learning_rate": 3.780509557462497e-08, + "loss": 0.70179164, + "num_input_tokens_seen": 168948240, + "step": 7816, + "time_per_iteration": 2.832568883895874 + }, + { + "auxiliary_loss_clip": 0.01108749, + "auxiliary_loss_mlp": 0.01084498, + "balance_loss_clip": 1.02288246, + "balance_loss_mlp": 1.0043416, + "epoch": 0.9399386761257741, + "flos": 25372253462400.0, + "grad_norm": 1.736817829294174, + "language_loss": 0.75804806, + "learning_rate": 3.765450224711375e-08, + "loss": 0.77998048, + "num_input_tokens_seen": 168968745, + "step": 7817, + "time_per_iteration": 2.7171525955200195 + }, + { + "auxiliary_loss_clip": 0.01108726, + "auxiliary_loss_mlp": 0.01083361, + "balance_loss_clip": 1.02039599, + "balance_loss_mlp": 1.00325239, + "epoch": 0.9400589190164131, + "flos": 27304965584640.0, + "grad_norm": 1.5775849812280212, + "language_loss": 0.79701442, + "learning_rate": 3.750420660585396e-08, + "loss": 0.81893528, + "num_input_tokens_seen": 168990685, + "step": 7818, + "time_per_iteration": 2.8488311767578125 + }, + { + "auxiliary_loss_clip": 0.01135551, + "auxiliary_loss_mlp": 0.01085224, + "balance_loss_clip": 1.0263412, + "balance_loss_mlp": 1.00506759, + "epoch": 0.9401791619070522, + "flos": 23399859790080.0, + "grad_norm": 1.6104874465682664, + "language_loss": 0.79899526, + "learning_rate": 3.735420867364603e-08, + "loss": 0.82120299, + "num_input_tokens_seen": 169011665, + "step": 7819, + "time_per_iteration": 2.6218619346618652 + }, + { + "auxiliary_loss_clip": 0.01087109, + "auxiliary_loss_mlp": 0.0108298, + "balance_loss_clip": 1.02079356, + "balance_loss_mlp": 1.00296712, + "epoch": 0.9402994047976914, + "flos": 35881403120640.0, + "grad_norm": 1.5492300514045836, + "language_loss": 0.61452401, + "learning_rate": 3.7204508473244186e-08, + "loss": 0.63622499, + "num_input_tokens_seen": 169035290, + "step": 7820, + "time_per_iteration": 3.769090175628662 + }, + { + "auxiliary_loss_clip": 0.01068603, + "auxiliary_loss_mlp": 0.01084335, + "balance_loss_clip": 1.01995778, + "balance_loss_mlp": 1.00441742, + "epoch": 0.9404196476883304, + "flos": 22236821320320.0, + "grad_norm": 1.522238116245065, + "language_loss": 0.69279718, + "learning_rate": 3.7055106027357395e-08, + "loss": 0.71432656, + "num_input_tokens_seen": 169055155, + "step": 7821, + "time_per_iteration": 3.779540777206421 + }, + { + "auxiliary_loss_clip": 0.01125539, + "auxiliary_loss_mlp": 0.01083824, + "balance_loss_clip": 1.02526164, + "balance_loss_mlp": 1.00376308, + "epoch": 0.9405398905789695, + "flos": 18915802583040.0, + "grad_norm": 2.826697023268165, + "language_loss": 0.71802765, + "learning_rate": 3.690600135865063e-08, + "loss": 0.7401213, + "num_input_tokens_seen": 169072080, + "step": 7822, + "time_per_iteration": 2.628650426864624 + }, + { + "auxiliary_loss_clip": 0.01063055, + "auxiliary_loss_mlp": 0.01079355, + "balance_loss_clip": 1.01774883, + "balance_loss_mlp": 1.00039124, + "epoch": 0.9406601334696086, + "flos": 70274130048000.0, + "grad_norm": 0.7892046254631083, + "language_loss": 0.58166569, + "learning_rate": 3.675719448974246e-08, + "loss": 0.60308981, + "num_input_tokens_seen": 169137170, + "step": 7823, + "time_per_iteration": 4.301343679428101 + }, + { + "auxiliary_loss_clip": 0.01093648, + "auxiliary_loss_mlp": 0.00872863, + "balance_loss_clip": 1.0205493, + "balance_loss_mlp": 1.00002551, + "epoch": 0.9407803763602477, + "flos": 22165071903360.0, + "grad_norm": 2.6592663510150127, + "language_loss": 0.6026504, + "learning_rate": 3.6608685443207054e-08, + "loss": 0.62231553, + "num_input_tokens_seen": 169156320, + "step": 7824, + "time_per_iteration": 2.8747565746307373 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01084211, + "balance_loss_clip": 1.02420521, + "balance_loss_mlp": 1.00405526, + "epoch": 0.9409006192508867, + "flos": 18879496911360.0, + "grad_norm": 2.3692228871399323, + "language_loss": 0.66472447, + "learning_rate": 3.646047424157306e-08, + "loss": 0.6866442, + "num_input_tokens_seen": 169173295, + "step": 7825, + "time_per_iteration": 2.7649407386779785 + }, + { + "auxiliary_loss_clip": 0.01108075, + "auxiliary_loss_mlp": 0.010845, + "balance_loss_clip": 1.02292073, + "balance_loss_mlp": 1.00429606, + "epoch": 0.9410208621415259, + "flos": 23368258800000.0, + "grad_norm": 2.288226883249012, + "language_loss": 0.68457919, + "learning_rate": 3.631256090732382e-08, + "loss": 0.70650494, + "num_input_tokens_seen": 169193755, + "step": 7826, + "time_per_iteration": 2.733550548553467 + }, + { + "auxiliary_loss_clip": 0.01108047, + "auxiliary_loss_mlp": 0.01084849, + "balance_loss_clip": 1.02437806, + "balance_loss_mlp": 1.00474083, + "epoch": 0.941141105032165, + "flos": 22742227635840.0, + "grad_norm": 1.574638483037183, + "language_loss": 0.82609606, + "learning_rate": 3.6164945462897833e-08, + "loss": 0.84802508, + "num_input_tokens_seen": 169213045, + "step": 7827, + "time_per_iteration": 2.8451590538024902 + }, + { + "auxiliary_loss_clip": 0.01124623, + "auxiliary_loss_mlp": 0.00872849, + "balance_loss_clip": 1.02498412, + "balance_loss_mlp": 1.00008249, + "epoch": 0.941261347922804, + "flos": 20704908130560.0, + "grad_norm": 1.7384728233288282, + "language_loss": 0.75625056, + "learning_rate": 3.6017627930687856e-08, + "loss": 0.77622527, + "num_input_tokens_seen": 169232870, + "step": 7828, + "time_per_iteration": 2.696363687515259 + }, + { + "auxiliary_loss_clip": 0.01097584, + "auxiliary_loss_mlp": 0.0108359, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.00352859, + "epoch": 0.9413815908134432, + "flos": 19421998997760.0, + "grad_norm": 2.101131463205569, + "language_loss": 0.77376008, + "learning_rate": 3.587060833304267e-08, + "loss": 0.79557186, + "num_input_tokens_seen": 169251060, + "step": 7829, + "time_per_iteration": 2.837407112121582 + }, + { + "auxiliary_loss_clip": 0.01126777, + "auxiliary_loss_mlp": 0.01084686, + "balance_loss_clip": 1.02610409, + "balance_loss_mlp": 1.00457776, + "epoch": 0.9415018337040822, + "flos": 17493452853120.0, + "grad_norm": 1.881933212714145, + "language_loss": 0.64047694, + "learning_rate": 3.5723886692264225e-08, + "loss": 0.66259158, + "num_input_tokens_seen": 169268600, + "step": 7830, + "time_per_iteration": 2.7191178798675537 + }, + { + "auxiliary_loss_clip": 0.01116377, + "auxiliary_loss_mlp": 0.01084453, + "balance_loss_clip": 1.02449608, + "balance_loss_mlp": 1.00434446, + "epoch": 0.9416220765947213, + "flos": 31831613343360.0, + "grad_norm": 1.8764208386589034, + "language_loss": 0.61496913, + "learning_rate": 3.557746303061071e-08, + "loss": 0.63697743, + "num_input_tokens_seen": 169290355, + "step": 7831, + "time_per_iteration": 2.8559134006500244 + }, + { + "auxiliary_loss_clip": 0.01114743, + "auxiliary_loss_mlp": 0.01083931, + "balance_loss_clip": 1.02337635, + "balance_loss_mlp": 1.00386977, + "epoch": 0.9417423194853605, + "flos": 23511973115520.0, + "grad_norm": 1.6164645525806691, + "language_loss": 0.72410655, + "learning_rate": 3.543133737029391e-08, + "loss": 0.74609327, + "num_input_tokens_seen": 169310865, + "step": 7832, + "time_per_iteration": 2.751140832901001 + }, + { + "auxiliary_loss_clip": 0.01125469, + "auxiliary_loss_mlp": 0.01083985, + "balance_loss_clip": 1.0243994, + "balance_loss_mlp": 1.00382841, + "epoch": 0.9418625623759995, + "flos": 23915106432000.0, + "grad_norm": 1.6017729043310236, + "language_loss": 0.69018281, + "learning_rate": 3.5285509733481214e-08, + "loss": 0.71227741, + "num_input_tokens_seen": 169330590, + "step": 7833, + "time_per_iteration": 2.6802475452423096 + }, + { + "auxiliary_loss_clip": 0.01127557, + "auxiliary_loss_mlp": 0.01084362, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.00420594, + "epoch": 0.9419828052666386, + "flos": 18076965292800.0, + "grad_norm": 1.615069152657317, + "language_loss": 0.76397216, + "learning_rate": 3.513998014229469e-08, + "loss": 0.78609145, + "num_input_tokens_seen": 169349540, + "step": 7834, + "time_per_iteration": 2.7794229984283447 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01083326, + "balance_loss_clip": 1.02543402, + "balance_loss_mlp": 1.00331259, + "epoch": 0.9421030481572777, + "flos": 17712328377600.0, + "grad_norm": 2.6987026675597563, + "language_loss": 0.86374611, + "learning_rate": 3.499474861881069e-08, + "loss": 0.88558495, + "num_input_tokens_seen": 169366765, + "step": 7835, + "time_per_iteration": 2.6939003467559814 + }, + { + "auxiliary_loss_clip": 0.01087058, + "auxiliary_loss_mlp": 0.01083634, + "balance_loss_clip": 1.02269757, + "balance_loss_mlp": 1.00357294, + "epoch": 0.9422232910479168, + "flos": 20194114775040.0, + "grad_norm": 2.113043479931812, + "language_loss": 0.68091345, + "learning_rate": 3.4849815185061136e-08, + "loss": 0.70262039, + "num_input_tokens_seen": 169386655, + "step": 7836, + "time_per_iteration": 2.824702262878418 + }, + { + "auxiliary_loss_clip": 0.01126732, + "auxiliary_loss_mlp": 0.01083253, + "balance_loss_clip": 1.02572489, + "balance_loss_mlp": 1.00323975, + "epoch": 0.9423435339385559, + "flos": 18442571875200.0, + "grad_norm": 1.7729737732496436, + "language_loss": 0.75678623, + "learning_rate": 3.470517986303223e-08, + "loss": 0.77888608, + "num_input_tokens_seen": 169405640, + "step": 7837, + "time_per_iteration": 2.7382071018218994 + }, + { + "auxiliary_loss_clip": 0.0110678, + "auxiliary_loss_mlp": 0.01085207, + "balance_loss_clip": 1.02434444, + "balance_loss_mlp": 1.00500321, + "epoch": 0.942463776829195, + "flos": 20080636732800.0, + "grad_norm": 1.9210429219828566, + "language_loss": 0.79457641, + "learning_rate": 3.4560842674664856e-08, + "loss": 0.81649631, + "num_input_tokens_seen": 169424155, + "step": 7838, + "time_per_iteration": 2.7656967639923096 + }, + { + "auxiliary_loss_clip": 0.01125628, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_clip": 1.02471185, + "balance_loss_mlp": 1.00483537, + "epoch": 0.9425840197198341, + "flos": 22636255536000.0, + "grad_norm": 1.7843379241964459, + "language_loss": 0.75427574, + "learning_rate": 3.441680364185506e-08, + "loss": 0.77638245, + "num_input_tokens_seen": 169444025, + "step": 7839, + "time_per_iteration": 2.748699188232422 + }, + { + "auxiliary_loss_clip": 0.01116148, + "auxiliary_loss_mlp": 0.01084264, + "balance_loss_clip": 1.02412128, + "balance_loss_mlp": 1.00405955, + "epoch": 0.9427042626104731, + "flos": 19937892084480.0, + "grad_norm": 1.9677492057275112, + "language_loss": 0.74367595, + "learning_rate": 3.427306278645314e-08, + "loss": 0.76568007, + "num_input_tokens_seen": 169462480, + "step": 7840, + "time_per_iteration": 2.7233142852783203 + }, + { + "auxiliary_loss_clip": 0.01094767, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_clip": 1.02218819, + "balance_loss_mlp": 1.00353765, + "epoch": 0.9428245055011123, + "flos": 22856998567680.0, + "grad_norm": 1.7234088220097592, + "language_loss": 0.73026669, + "learning_rate": 3.4129620130264767e-08, + "loss": 0.7520498, + "num_input_tokens_seen": 169480840, + "step": 7841, + "time_per_iteration": 2.7559807300567627 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.00872855, + "balance_loss_clip": 1.0247643, + "balance_loss_mlp": 1.00008917, + "epoch": 0.9429447483917514, + "flos": 20951757371520.0, + "grad_norm": 2.1095599834643672, + "language_loss": 0.77862942, + "learning_rate": 3.398647569505009e-08, + "loss": 0.79852462, + "num_input_tokens_seen": 169498265, + "step": 7842, + "time_per_iteration": 2.722963333129883 + }, + { + "auxiliary_loss_clip": 0.01105733, + "auxiliary_loss_mlp": 0.01083139, + "balance_loss_clip": 1.0224309, + "balance_loss_mlp": 1.00293469, + "epoch": 0.9430649912823904, + "flos": 18843658116480.0, + "grad_norm": 2.1204778695638553, + "language_loss": 0.7514919, + "learning_rate": 3.384362950252373e-08, + "loss": 0.77338064, + "num_input_tokens_seen": 169515235, + "step": 7843, + "time_per_iteration": 2.735491991043091 + }, + { + "auxiliary_loss_clip": 0.01115799, + "auxiliary_loss_mlp": 0.01083876, + "balance_loss_clip": 1.02419388, + "balance_loss_mlp": 1.0039103, + "epoch": 0.9431852341730296, + "flos": 32556038837760.0, + "grad_norm": 1.7878810625121369, + "language_loss": 0.56932425, + "learning_rate": 3.3701081574355473e-08, + "loss": 0.59132099, + "num_input_tokens_seen": 169537195, + "step": 7844, + "time_per_iteration": 2.780099868774414 + }, + { + "auxiliary_loss_clip": 0.01079204, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_clip": 1.01728892, + "balance_loss_mlp": 0.99990839, + "epoch": 0.9433054770636686, + "flos": 66904490252160.0, + "grad_norm": 0.6408022837744337, + "language_loss": 0.51675373, + "learning_rate": 3.3558831932169796e-08, + "loss": 0.53833449, + "num_input_tokens_seen": 169605865, + "step": 7845, + "time_per_iteration": 4.2399373054504395 + }, + { + "auxiliary_loss_clip": 0.01124874, + "auxiliary_loss_mlp": 0.01084511, + "balance_loss_clip": 1.02527428, + "balance_loss_mlp": 1.00435424, + "epoch": 0.9434257199543077, + "flos": 26140346916480.0, + "grad_norm": 1.7480774954890101, + "language_loss": 0.88528132, + "learning_rate": 3.341688059754588e-08, + "loss": 0.90737522, + "num_input_tokens_seen": 169621520, + "step": 7846, + "time_per_iteration": 4.656937837600708 + }, + { + "auxiliary_loss_clip": 0.01108451, + "auxiliary_loss_mlp": 0.00872878, + "balance_loss_clip": 1.02373815, + "balance_loss_mlp": 1.0000912, + "epoch": 0.9435459628449467, + "flos": 25003486483200.0, + "grad_norm": 2.37854288670437, + "language_loss": 0.77999365, + "learning_rate": 3.327522759201762e-08, + "loss": 0.79980695, + "num_input_tokens_seen": 169641390, + "step": 7847, + "time_per_iteration": 2.8318800926208496 + }, + { + "auxiliary_loss_clip": 0.01100638, + "auxiliary_loss_mlp": 0.01083533, + "balance_loss_clip": 1.02007961, + "balance_loss_mlp": 1.00347209, + "epoch": 0.9436662057355859, + "flos": 22163240309760.0, + "grad_norm": 2.2261548275115834, + "language_loss": 0.66649497, + "learning_rate": 3.313387293707359e-08, + "loss": 0.68833661, + "num_input_tokens_seen": 169660095, + "step": 7848, + "time_per_iteration": 2.7617568969726562 + }, + { + "auxiliary_loss_clip": 0.01102476, + "auxiliary_loss_mlp": 0.01083039, + "balance_loss_clip": 1.0208118, + "balance_loss_mlp": 1.00278711, + "epoch": 0.943786448626225, + "flos": 20118522602880.0, + "grad_norm": 2.087975137747562, + "language_loss": 0.68648958, + "learning_rate": 3.29928166541571e-08, + "loss": 0.7083447, + "num_input_tokens_seen": 169679050, + "step": 7849, + "time_per_iteration": 3.7398793697357178 + }, + { + "auxiliary_loss_clip": 0.01110854, + "auxiliary_loss_mlp": 0.01084398, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.00438523, + "epoch": 0.943906691516864, + "flos": 22090808534400.0, + "grad_norm": 1.8993545007988852, + "language_loss": 0.80488503, + "learning_rate": 3.2852058764666346e-08, + "loss": 0.82683754, + "num_input_tokens_seen": 169698150, + "step": 7850, + "time_per_iteration": 2.7216861248016357 + }, + { + "auxiliary_loss_clip": 0.01096518, + "auxiliary_loss_mlp": 0.0108367, + "balance_loss_clip": 1.02216697, + "balance_loss_mlp": 1.00370431, + "epoch": 0.9440269344075032, + "flos": 35298501212160.0, + "grad_norm": 1.7933682727357403, + "language_loss": 0.68591666, + "learning_rate": 3.2711599289954264e-08, + "loss": 0.70771855, + "num_input_tokens_seen": 169722185, + "step": 7851, + "time_per_iteration": 2.889068603515625 + }, + { + "auxiliary_loss_clip": 0.01087097, + "auxiliary_loss_mlp": 0.01083725, + "balance_loss_clip": 1.02209127, + "balance_loss_mlp": 1.00366414, + "epoch": 0.9441471772981422, + "flos": 19238136255360.0, + "grad_norm": 1.9615597676380836, + "language_loss": 0.77862793, + "learning_rate": 3.257143825132847e-08, + "loss": 0.80033618, + "num_input_tokens_seen": 169740355, + "step": 7852, + "time_per_iteration": 2.8262689113616943 + }, + { + "auxiliary_loss_clip": 0.01116163, + "auxiliary_loss_mlp": 0.01084368, + "balance_loss_clip": 1.0244571, + "balance_loss_mlp": 1.00435448, + "epoch": 0.9442674201887813, + "flos": 25739799379200.0, + "grad_norm": 1.6644876320173345, + "language_loss": 0.76414084, + "learning_rate": 3.243157567005106e-08, + "loss": 0.78614616, + "num_input_tokens_seen": 169758535, + "step": 7853, + "time_per_iteration": 2.7527244091033936 + }, + { + "auxiliary_loss_clip": 0.01138442, + "auxiliary_loss_mlp": 0.01084177, + "balance_loss_clip": 1.02920985, + "balance_loss_mlp": 1.00406873, + "epoch": 0.9443876630794205, + "flos": 15523321737600.0, + "grad_norm": 2.218374717081032, + "language_loss": 0.63816762, + "learning_rate": 3.2292011567339296e-08, + "loss": 0.66039383, + "num_input_tokens_seen": 169776340, + "step": 7854, + "time_per_iteration": 2.6586813926696777 + }, + { + "auxiliary_loss_clip": 0.0112683, + "auxiliary_loss_mlp": 0.00872831, + "balance_loss_clip": 1.02538633, + "balance_loss_mlp": 1.00010431, + "epoch": 0.9445079059700595, + "flos": 13400821128960.0, + "grad_norm": 2.386135470144085, + "language_loss": 0.56266004, + "learning_rate": 3.21527459643649e-08, + "loss": 0.58265668, + "num_input_tokens_seen": 169793225, + "step": 7855, + "time_per_iteration": 2.694519519805908 + }, + { + "auxiliary_loss_clip": 0.01125948, + "auxiliary_loss_mlp": 0.01083711, + "balance_loss_clip": 1.02543545, + "balance_loss_mlp": 1.00360227, + "epoch": 0.9446281488606986, + "flos": 23659242877440.0, + "grad_norm": 1.9100108009703216, + "language_loss": 0.73863053, + "learning_rate": 3.2013778882254536e-08, + "loss": 0.76072711, + "num_input_tokens_seen": 169812020, + "step": 7856, + "time_per_iteration": 2.6580657958984375 + }, + { + "auxiliary_loss_clip": 0.01128053, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_clip": 1.02681565, + "balance_loss_mlp": 1.00359464, + "epoch": 0.9447483917513377, + "flos": 25557337267200.0, + "grad_norm": 1.9032432200746723, + "language_loss": 0.75381875, + "learning_rate": 3.1875110342088676e-08, + "loss": 0.77593631, + "num_input_tokens_seen": 169833470, + "step": 7857, + "time_per_iteration": 2.75638484954834 + }, + { + "auxiliary_loss_clip": 0.01114305, + "auxiliary_loss_mlp": 0.01083074, + "balance_loss_clip": 1.02351487, + "balance_loss_mlp": 1.00306082, + "epoch": 0.9448686346419768, + "flos": 24535463247360.0, + "grad_norm": 2.2962423216050176, + "language_loss": 0.65549082, + "learning_rate": 3.1736740364904035e-08, + "loss": 0.6774646, + "num_input_tokens_seen": 169854000, + "step": 7858, + "time_per_iteration": 2.7710630893707275 + }, + { + "auxiliary_loss_clip": 0.0110024, + "auxiliary_loss_mlp": 0.00872864, + "balance_loss_clip": 1.02515829, + "balance_loss_mlp": 1.000085, + "epoch": 0.9449888775326158, + "flos": 14721256995840.0, + "grad_norm": 1.9730042966623011, + "language_loss": 0.77183801, + "learning_rate": 3.159866897169094e-08, + "loss": 0.79156899, + "num_input_tokens_seen": 169872200, + "step": 7859, + "time_per_iteration": 2.815537214279175 + }, + { + "auxiliary_loss_clip": 0.01089641, + "auxiliary_loss_mlp": 0.01085203, + "balance_loss_clip": 1.02290797, + "balance_loss_mlp": 1.00504708, + "epoch": 0.945109120423255, + "flos": 15447873219840.0, + "grad_norm": 1.8862124940074492, + "language_loss": 0.75566339, + "learning_rate": 3.146089618339487e-08, + "loss": 0.77741182, + "num_input_tokens_seen": 169889055, + "step": 7860, + "time_per_iteration": 2.8626296520233154 + }, + { + "auxiliary_loss_clip": 0.01107412, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_clip": 1.02351522, + "balance_loss_mlp": 1.00396121, + "epoch": 0.9452293633138941, + "flos": 25448097029760.0, + "grad_norm": 1.6886841745452914, + "language_loss": 0.67747164, + "learning_rate": 3.132342202091554e-08, + "loss": 0.69938642, + "num_input_tokens_seen": 169909280, + "step": 7861, + "time_per_iteration": 2.766942024230957 + }, + { + "auxiliary_loss_clip": 0.01134643, + "auxiliary_loss_mlp": 0.01084964, + "balance_loss_clip": 1.02503467, + "balance_loss_mlp": 1.00485563, + "epoch": 0.9453496062045331, + "flos": 21215342350080.0, + "grad_norm": 2.0761749349077068, + "language_loss": 0.68340188, + "learning_rate": 3.1186246505107595e-08, + "loss": 0.70559794, + "num_input_tokens_seen": 169928420, + "step": 7862, + "time_per_iteration": 2.6791961193084717 + }, + { + "auxiliary_loss_clip": 0.01124338, + "auxiliary_loss_mlp": 0.01084071, + "balance_loss_clip": 1.02515745, + "balance_loss_mlp": 1.00396204, + "epoch": 0.9454698490951723, + "flos": 20010898477440.0, + "grad_norm": 1.6402673856433065, + "language_loss": 0.83807003, + "learning_rate": 3.104936965678084e-08, + "loss": 0.86015403, + "num_input_tokens_seen": 169946750, + "step": 7863, + "time_per_iteration": 2.6780130863189697 + }, + { + "auxiliary_loss_clip": 0.01124124, + "auxiliary_loss_mlp": 0.0108446, + "balance_loss_clip": 1.02396035, + "balance_loss_mlp": 1.00439858, + "epoch": 0.9455900919858113, + "flos": 21069652786560.0, + "grad_norm": 1.8815086566530026, + "language_loss": 0.8206194, + "learning_rate": 3.091279149669956e-08, + "loss": 0.84270525, + "num_input_tokens_seen": 169965540, + "step": 7864, + "time_per_iteration": 2.667238235473633 + }, + { + "auxiliary_loss_clip": 0.01125279, + "auxiliary_loss_mlp": 0.00872911, + "balance_loss_clip": 1.02489233, + "balance_loss_mlp": 1.00007844, + "epoch": 0.9457103348764504, + "flos": 20740854666240.0, + "grad_norm": 1.8822460652811093, + "language_loss": 0.7330426, + "learning_rate": 3.0776512045581624e-08, + "loss": 0.75302446, + "num_input_tokens_seen": 169984330, + "step": 7865, + "time_per_iteration": 2.672445058822632 + }, + { + "auxiliary_loss_clip": 0.01117182, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_clip": 1.02508593, + "balance_loss_mlp": 1.00502086, + "epoch": 0.9458305777670896, + "flos": 21428363957760.0, + "grad_norm": 1.9118818119494372, + "language_loss": 0.77843416, + "learning_rate": 3.0640531324101384e-08, + "loss": 0.80045819, + "num_input_tokens_seen": 170002095, + "step": 7866, + "time_per_iteration": 2.827608346939087 + }, + { + "auxiliary_loss_clip": 0.01126112, + "auxiliary_loss_mlp": 0.01084865, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.00470889, + "epoch": 0.9459508206577286, + "flos": 20011185786240.0, + "grad_norm": 1.7594731112383335, + "language_loss": 0.76015306, + "learning_rate": 3.0504849352886554e-08, + "loss": 0.7822628, + "num_input_tokens_seen": 170020240, + "step": 7867, + "time_per_iteration": 2.656494140625 + }, + { + "auxiliary_loss_clip": 0.01125834, + "auxiliary_loss_mlp": 0.0108495, + "balance_loss_clip": 1.0250361, + "balance_loss_mlp": 1.00484109, + "epoch": 0.9460710635483677, + "flos": 12166428291840.0, + "grad_norm": 2.2097245157439693, + "language_loss": 0.71247077, + "learning_rate": 3.036946615252023e-08, + "loss": 0.73457861, + "num_input_tokens_seen": 170035770, + "step": 7868, + "time_per_iteration": 2.650540590286255 + }, + { + "auxiliary_loss_clip": 0.01100315, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_clip": 1.02475011, + "balance_loss_mlp": 1.00370955, + "epoch": 0.9461913064390068, + "flos": 34276196229120.0, + "grad_norm": 2.206288643088609, + "language_loss": 0.66692221, + "learning_rate": 3.0234381743539984e-08, + "loss": 0.6887635, + "num_input_tokens_seen": 170053385, + "step": 7869, + "time_per_iteration": 2.8928136825561523 + }, + { + "auxiliary_loss_clip": 0.01118298, + "auxiliary_loss_mlp": 0.01083966, + "balance_loss_clip": 1.02490187, + "balance_loss_mlp": 1.00385702, + "epoch": 0.9463115493296459, + "flos": 19463763536640.0, + "grad_norm": 1.8957357329722138, + "language_loss": 0.79963952, + "learning_rate": 3.0099596146437863e-08, + "loss": 0.82166213, + "num_input_tokens_seen": 170070490, + "step": 7870, + "time_per_iteration": 2.706354856491089 + }, + { + "auxiliary_loss_clip": 0.01112814, + "auxiliary_loss_mlp": 0.01078872, + "balance_loss_clip": 1.01766515, + "balance_loss_mlp": 0.99990767, + "epoch": 0.946431792220285, + "flos": 70570824387840.0, + "grad_norm": 0.7793514638797286, + "language_loss": 0.60142612, + "learning_rate": 2.996510938166086e-08, + "loss": 0.62334299, + "num_input_tokens_seen": 170133465, + "step": 7871, + "time_per_iteration": 4.989814043045044 + }, + { + "auxiliary_loss_clip": 0.01126472, + "auxiliary_loss_mlp": 0.01084018, + "balance_loss_clip": 1.02682412, + "balance_loss_mlp": 1.00386214, + "epoch": 0.9465520351109241, + "flos": 18947906363520.0, + "grad_norm": 1.7850352633766193, + "language_loss": 0.73384225, + "learning_rate": 2.983092146960997e-08, + "loss": 0.75594723, + "num_input_tokens_seen": 170150810, + "step": 7872, + "time_per_iteration": 3.674880027770996 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01084558, + "balance_loss_clip": 1.02533293, + "balance_loss_mlp": 1.004354, + "epoch": 0.9466722780015632, + "flos": 19135647774720.0, + "grad_norm": 2.4314287515393285, + "language_loss": 0.79762995, + "learning_rate": 2.9697032430642256e-08, + "loss": 0.81966412, + "num_input_tokens_seen": 170169025, + "step": 7873, + "time_per_iteration": 2.7131617069244385 + }, + { + "auxiliary_loss_clip": 0.01133838, + "auxiliary_loss_mlp": 0.01083058, + "balance_loss_clip": 1.02519953, + "balance_loss_mlp": 1.00299668, + "epoch": 0.9467925208922022, + "flos": 17237912520960.0, + "grad_norm": 2.3347210872215216, + "language_loss": 0.73539054, + "learning_rate": 2.9563442285067906e-08, + "loss": 0.75755954, + "num_input_tokens_seen": 170186070, + "step": 7874, + "time_per_iteration": 2.601170063018799 + }, + { + "auxiliary_loss_clip": 0.01125287, + "auxiliary_loss_mlp": 0.010842, + "balance_loss_clip": 1.02482688, + "balance_loss_mlp": 1.00399649, + "epoch": 0.9469127637828414, + "flos": 29169016859520.0, + "grad_norm": 7.081255259147599, + "language_loss": 0.79208696, + "learning_rate": 2.943015105315294e-08, + "loss": 0.8141818, + "num_input_tokens_seen": 170206265, + "step": 7875, + "time_per_iteration": 3.7053751945495605 + }, + { + "auxiliary_loss_clip": 0.01090445, + "auxiliary_loss_mlp": 0.01085281, + "balance_loss_clip": 1.02158427, + "balance_loss_mlp": 1.00512481, + "epoch": 0.9470330066734804, + "flos": 26030460234240.0, + "grad_norm": 2.3838674546359, + "language_loss": 0.66326946, + "learning_rate": 2.929715875511718e-08, + "loss": 0.68502676, + "num_input_tokens_seen": 170225300, + "step": 7876, + "time_per_iteration": 2.802385091781616 + }, + { + "auxiliary_loss_clip": 0.01125534, + "auxiliary_loss_mlp": 0.01084464, + "balance_loss_clip": 1.02412081, + "balance_loss_mlp": 1.00430775, + "epoch": 0.9471532495641195, + "flos": 23440906056960.0, + "grad_norm": 1.8732907194639488, + "language_loss": 0.69991529, + "learning_rate": 2.9164465411135375e-08, + "loss": 0.72201526, + "num_input_tokens_seen": 170245070, + "step": 7877, + "time_per_iteration": 2.75041127204895 + }, + { + "auxiliary_loss_clip": 0.01124835, + "auxiliary_loss_mlp": 0.0108436, + "balance_loss_clip": 1.02498591, + "balance_loss_mlp": 1.004251, + "epoch": 0.9472734924547586, + "flos": 15815850099840.0, + "grad_norm": 1.7157915898304081, + "language_loss": 0.80983698, + "learning_rate": 2.9032071041337426e-08, + "loss": 0.83192891, + "num_input_tokens_seen": 170263305, + "step": 7878, + "time_per_iteration": 2.645768880844116 + }, + { + "auxiliary_loss_clip": 0.01118023, + "auxiliary_loss_mlp": 0.01083809, + "balance_loss_clip": 1.02534866, + "balance_loss_mlp": 1.00374818, + "epoch": 0.9473937353453977, + "flos": 11181793697280.0, + "grad_norm": 1.843710389969884, + "language_loss": 0.72890198, + "learning_rate": 2.889997566580704e-08, + "loss": 0.7509203, + "num_input_tokens_seen": 170281460, + "step": 7879, + "time_per_iteration": 2.7314069271087646 + }, + { + "auxiliary_loss_clip": 0.01134736, + "auxiliary_loss_mlp": 0.01083923, + "balance_loss_clip": 1.02549613, + "balance_loss_mlp": 1.00371933, + "epoch": 0.9475139782360368, + "flos": 25775530433280.0, + "grad_norm": 1.5854150503114368, + "language_loss": 0.70076597, + "learning_rate": 2.8768179304583086e-08, + "loss": 0.7229526, + "num_input_tokens_seen": 170303515, + "step": 7880, + "time_per_iteration": 2.683877468109131 + }, + { + "auxiliary_loss_clip": 0.0110679, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_clip": 1.02416515, + "balance_loss_mlp": 1.00551081, + "epoch": 0.9476342211266758, + "flos": 22820046451200.0, + "grad_norm": 1.6208017326484265, + "language_loss": 0.73423326, + "learning_rate": 2.8636681977659117e-08, + "loss": 0.75615782, + "num_input_tokens_seen": 170323165, + "step": 7881, + "time_per_iteration": 2.778682231903076 + }, + { + "auxiliary_loss_clip": 0.01096978, + "auxiliary_loss_mlp": 0.01083107, + "balance_loss_clip": 1.02297807, + "balance_loss_mlp": 1.00299871, + "epoch": 0.947754464017315, + "flos": 20193611984640.0, + "grad_norm": 2.249687211945673, + "language_loss": 0.7764979, + "learning_rate": 2.850548370498318e-08, + "loss": 0.79829872, + "num_input_tokens_seen": 170341005, + "step": 7882, + "time_per_iteration": 2.820497751235962 + }, + { + "auxiliary_loss_clip": 0.01126299, + "auxiliary_loss_mlp": 0.01083685, + "balance_loss_clip": 1.02550709, + "balance_loss_mlp": 1.00367129, + "epoch": 0.9478747069079541, + "flos": 24717925359360.0, + "grad_norm": 2.4390429639424958, + "language_loss": 0.71296144, + "learning_rate": 2.8374584506457798e-08, + "loss": 0.73506129, + "num_input_tokens_seen": 170362280, + "step": 7883, + "time_per_iteration": 2.7539970874786377 + }, + { + "auxiliary_loss_clip": 0.01113302, + "auxiliary_loss_mlp": 0.01084075, + "balance_loss_clip": 1.02294016, + "balance_loss_mlp": 1.00391912, + "epoch": 0.9479949497985931, + "flos": 21361355136000.0, + "grad_norm": 3.974290730958568, + "language_loss": 0.67553037, + "learning_rate": 2.824398440193998e-08, + "loss": 0.6975041, + "num_input_tokens_seen": 170381080, + "step": 7884, + "time_per_iteration": 2.733086109161377 + }, + { + "auxiliary_loss_clip": 0.01095832, + "auxiliary_loss_mlp": 0.01083674, + "balance_loss_clip": 1.02157259, + "balance_loss_mlp": 1.00342274, + "epoch": 0.9481151926892323, + "flos": 18148606968960.0, + "grad_norm": 1.886412606538585, + "language_loss": 0.71515894, + "learning_rate": 2.811368341124232e-08, + "loss": 0.73695397, + "num_input_tokens_seen": 170400150, + "step": 7885, + "time_per_iteration": 2.7917208671569824 + }, + { + "auxiliary_loss_clip": 0.01120135, + "auxiliary_loss_mlp": 0.01085268, + "balance_loss_clip": 1.02628684, + "balance_loss_mlp": 1.00525451, + "epoch": 0.9482354355798713, + "flos": 22128012046080.0, + "grad_norm": 2.6924202184068013, + "language_loss": 0.68149328, + "learning_rate": 2.7983681554131222e-08, + "loss": 0.7035473, + "num_input_tokens_seen": 170420410, + "step": 7886, + "time_per_iteration": 2.7629811763763428 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01084164, + "balance_loss_clip": 1.02547324, + "balance_loss_mlp": 1.00391257, + "epoch": 0.9483556784705104, + "flos": 19063072344960.0, + "grad_norm": 2.0224602228216946, + "language_loss": 0.70089811, + "learning_rate": 2.7853978850327365e-08, + "loss": 0.72293055, + "num_input_tokens_seen": 170439580, + "step": 7887, + "time_per_iteration": 2.69091534614563 + }, + { + "auxiliary_loss_clip": 0.01105725, + "auxiliary_loss_mlp": 0.01083143, + "balance_loss_clip": 1.02433228, + "balance_loss_mlp": 1.0031774, + "epoch": 0.9484759213611496, + "flos": 25777110631680.0, + "grad_norm": 1.8610437015091896, + "language_loss": 0.87277031, + "learning_rate": 2.7724575319507225e-08, + "loss": 0.89465892, + "num_input_tokens_seen": 170459290, + "step": 7888, + "time_per_iteration": 2.8500187397003174 + }, + { + "auxiliary_loss_clip": 0.01125579, + "auxiliary_loss_mlp": 0.01084685, + "balance_loss_clip": 1.02437425, + "balance_loss_mlp": 1.00462389, + "epoch": 0.9485961642517886, + "flos": 20667740532480.0, + "grad_norm": 2.2454826419578993, + "language_loss": 0.77221352, + "learning_rate": 2.759547098130044e-08, + "loss": 0.79431617, + "num_input_tokens_seen": 170478020, + "step": 7889, + "time_per_iteration": 2.7032158374786377 + }, + { + "auxiliary_loss_clip": 0.01134223, + "auxiliary_loss_mlp": 0.01083304, + "balance_loss_clip": 1.0248661, + "balance_loss_mlp": 1.00329041, + "epoch": 0.9487164071424277, + "flos": 22674069578880.0, + "grad_norm": 1.7617013156490184, + "language_loss": 0.76800811, + "learning_rate": 2.746666585529267e-08, + "loss": 0.79018337, + "num_input_tokens_seen": 170498295, + "step": 7890, + "time_per_iteration": 2.6731791496276855 + }, + { + "auxiliary_loss_clip": 0.01126531, + "auxiliary_loss_mlp": 0.01084352, + "balance_loss_clip": 1.02545869, + "balance_loss_mlp": 1.00424385, + "epoch": 0.9488366500330668, + "flos": 38726461716480.0, + "grad_norm": 3.132675236520341, + "language_loss": 0.74220335, + "learning_rate": 2.73381599610234e-08, + "loss": 0.76431215, + "num_input_tokens_seen": 170518695, + "step": 7891, + "time_per_iteration": 2.796459197998047 + }, + { + "auxiliary_loss_clip": 0.01125424, + "auxiliary_loss_mlp": 0.01084238, + "balance_loss_clip": 1.02410555, + "balance_loss_mlp": 1.00417686, + "epoch": 0.9489568929237059, + "flos": 27890920149120.0, + "grad_norm": 1.7377128170805218, + "language_loss": 0.71152258, + "learning_rate": 2.7209953317987033e-08, + "loss": 0.73361921, + "num_input_tokens_seen": 170539735, + "step": 7892, + "time_per_iteration": 2.7084591388702393 + }, + { + "auxiliary_loss_clip": 0.01124664, + "auxiliary_loss_mlp": 0.01084209, + "balance_loss_clip": 1.02449524, + "balance_loss_mlp": 1.00410032, + "epoch": 0.9490771358143449, + "flos": 33580642291200.0, + "grad_norm": 14.394329588990677, + "language_loss": 0.7843498, + "learning_rate": 2.7082045945631793e-08, + "loss": 0.80643857, + "num_input_tokens_seen": 170561950, + "step": 7893, + "time_per_iteration": 2.760812759399414 + }, + { + "auxiliary_loss_clip": 0.01105739, + "auxiliary_loss_mlp": 0.01084983, + "balance_loss_clip": 1.02346385, + "balance_loss_mlp": 1.00487399, + "epoch": 0.9491973787049841, + "flos": 14793796512000.0, + "grad_norm": 2.0363174192037374, + "language_loss": 0.69484895, + "learning_rate": 2.6954437863361712e-08, + "loss": 0.71675617, + "num_input_tokens_seen": 170579865, + "step": 7894, + "time_per_iteration": 2.7594645023345947 + }, + { + "auxiliary_loss_clip": 0.01086748, + "auxiliary_loss_mlp": 0.01083073, + "balance_loss_clip": 1.0213877, + "balance_loss_mlp": 1.00310707, + "epoch": 0.9493176215956232, + "flos": 25332535998720.0, + "grad_norm": 1.850315510251535, + "language_loss": 0.70727402, + "learning_rate": 2.6827129090534862e-08, + "loss": 0.72897226, + "num_input_tokens_seen": 170600165, + "step": 7895, + "time_per_iteration": 2.8758833408355713 + }, + { + "auxiliary_loss_clip": 0.01110522, + "auxiliary_loss_mlp": 0.01083807, + "balance_loss_clip": 1.02099967, + "balance_loss_mlp": 1.00365019, + "epoch": 0.9494378644862622, + "flos": 21029971236480.0, + "grad_norm": 2.2943604128698802, + "language_loss": 0.77978206, + "learning_rate": 2.670011964646335e-08, + "loss": 0.80172533, + "num_input_tokens_seen": 170618845, + "step": 7896, + "time_per_iteration": 4.651498794555664 + }, + { + "auxiliary_loss_clip": 0.01082461, + "auxiliary_loss_mlp": 0.0108349, + "balance_loss_clip": 1.02200389, + "balance_loss_mlp": 1.00328624, + "epoch": 0.9495581073769014, + "flos": 15195134148480.0, + "grad_norm": 1.9330680603611623, + "language_loss": 0.68165934, + "learning_rate": 2.657340955041487e-08, + "loss": 0.70331883, + "num_input_tokens_seen": 170637620, + "step": 7897, + "time_per_iteration": 2.851473569869995 + }, + { + "auxiliary_loss_clip": 0.01110103, + "auxiliary_loss_mlp": 0.01083013, + "balance_loss_clip": 1.02108181, + "balance_loss_mlp": 1.00290418, + "epoch": 0.9496783502675404, + "flos": 28616566705920.0, + "grad_norm": 1.9913451959984143, + "language_loss": 0.71569735, + "learning_rate": 2.6446998821611167e-08, + "loss": 0.73762852, + "num_input_tokens_seen": 170657815, + "step": 7898, + "time_per_iteration": 3.745321035385132 + }, + { + "auxiliary_loss_clip": 0.01091732, + "auxiliary_loss_mlp": 0.01084359, + "balance_loss_clip": 1.02265954, + "balance_loss_mlp": 1.0041554, + "epoch": 0.9497985931581795, + "flos": 14866874732160.0, + "grad_norm": 2.147587841816661, + "language_loss": 0.71721292, + "learning_rate": 2.6320887479228228e-08, + "loss": 0.73897386, + "num_input_tokens_seen": 170674415, + "step": 7899, + "time_per_iteration": 2.8064897060394287 + }, + { + "auxiliary_loss_clip": 0.01116255, + "auxiliary_loss_mlp": 0.01084215, + "balance_loss_clip": 1.02403915, + "balance_loss_mlp": 1.00410604, + "epoch": 0.9499188360488187, + "flos": 27193319136000.0, + "grad_norm": 2.3282297582356195, + "language_loss": 0.72537589, + "learning_rate": 2.619507554239786e-08, + "loss": 0.74738061, + "num_input_tokens_seen": 170692975, + "step": 7900, + "time_per_iteration": 3.7554373741149902 + }, + { + "auxiliary_loss_clip": 0.01111465, + "auxiliary_loss_mlp": 0.010844, + "balance_loss_clip": 1.02128792, + "balance_loss_mlp": 1.00414872, + "epoch": 0.9500390789394577, + "flos": 24316479982080.0, + "grad_norm": 1.568144960522553, + "language_loss": 0.6989398, + "learning_rate": 2.606956303020502e-08, + "loss": 0.72089839, + "num_input_tokens_seen": 170713780, + "step": 7901, + "time_per_iteration": 2.816908836364746 + }, + { + "auxiliary_loss_clip": 0.01124233, + "auxiliary_loss_mlp": 0.0108412, + "balance_loss_clip": 1.02426219, + "balance_loss_mlp": 1.00405931, + "epoch": 0.9501593218300968, + "flos": 14354752573440.0, + "grad_norm": 1.766856057908404, + "language_loss": 0.84276068, + "learning_rate": 2.5944349961690036e-08, + "loss": 0.86484426, + "num_input_tokens_seen": 170730800, + "step": 7902, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.01106056, + "auxiliary_loss_mlp": 0.0108327, + "balance_loss_clip": 1.02356732, + "balance_loss_mlp": 1.00325656, + "epoch": 0.9502795647207359, + "flos": 38728113742080.0, + "grad_norm": 1.568838585253289, + "language_loss": 0.73100746, + "learning_rate": 2.581943635584749e-08, + "loss": 0.75290072, + "num_input_tokens_seen": 170753630, + "step": 7903, + "time_per_iteration": 2.9416897296905518 + }, + { + "auxiliary_loss_clip": 0.01116385, + "auxiliary_loss_mlp": 0.01083811, + "balance_loss_clip": 1.02455676, + "balance_loss_mlp": 1.00379753, + "epoch": 0.950399807611375, + "flos": 40808023799040.0, + "grad_norm": 1.4118688996444766, + "language_loss": 0.64956081, + "learning_rate": 2.569482223162689e-08, + "loss": 0.67156279, + "num_input_tokens_seen": 170777605, + "step": 7904, + "time_per_iteration": 2.8537957668304443 + }, + { + "auxiliary_loss_clip": 0.01124185, + "auxiliary_loss_mlp": 0.01082686, + "balance_loss_clip": 1.02379942, + "balance_loss_mlp": 1.00252962, + "epoch": 0.950520050502014, + "flos": 23440403266560.0, + "grad_norm": 1.8113346678120137, + "language_loss": 0.72473407, + "learning_rate": 2.5570507607932e-08, + "loss": 0.74680281, + "num_input_tokens_seen": 170797520, + "step": 7905, + "time_per_iteration": 2.6971962451934814 + }, + { + "auxiliary_loss_clip": 0.01126909, + "auxiliary_loss_mlp": 0.01084249, + "balance_loss_clip": 1.02569437, + "balance_loss_mlp": 1.00404453, + "epoch": 0.9506402933926532, + "flos": 17783718658560.0, + "grad_norm": 2.2078623530876857, + "language_loss": 0.63760239, + "learning_rate": 2.54464925036213e-08, + "loss": 0.65971392, + "num_input_tokens_seen": 170814810, + "step": 7906, + "time_per_iteration": 2.681528329849243 + }, + { + "auxiliary_loss_clip": 0.01124301, + "auxiliary_loss_mlp": 0.01084768, + "balance_loss_clip": 1.02474189, + "balance_loss_mlp": 1.00470722, + "epoch": 0.9507605362832923, + "flos": 32561928668160.0, + "grad_norm": 1.826447594903772, + "language_loss": 0.6086719, + "learning_rate": 2.532277693750773e-08, + "loss": 0.63076258, + "num_input_tokens_seen": 170835735, + "step": 7907, + "time_per_iteration": 2.7897021770477295 + }, + { + "auxiliary_loss_clip": 0.01095092, + "auxiliary_loss_mlp": 0.01084629, + "balance_loss_clip": 1.0220021, + "balance_loss_mlp": 1.00452054, + "epoch": 0.9508807791739313, + "flos": 19602054898560.0, + "grad_norm": 1.8105452605944008, + "language_loss": 0.75699699, + "learning_rate": 2.5199360928358948e-08, + "loss": 0.77879417, + "num_input_tokens_seen": 170852970, + "step": 7908, + "time_per_iteration": 2.811145305633545 + }, + { + "auxiliary_loss_clip": 0.01126051, + "auxiliary_loss_mlp": 0.00872865, + "balance_loss_clip": 1.02569389, + "balance_loss_mlp": 1.00013018, + "epoch": 0.9510010220645704, + "flos": 21471852349440.0, + "grad_norm": 2.0883024106563894, + "language_loss": 0.8682363, + "learning_rate": 2.507624449489665e-08, + "loss": 0.88822544, + "num_input_tokens_seen": 170871600, + "step": 7909, + "time_per_iteration": 2.638597011566162 + }, + { + "auxiliary_loss_clip": 0.01115277, + "auxiliary_loss_mlp": 0.01083251, + "balance_loss_clip": 1.0240171, + "balance_loss_mlp": 1.00309491, + "epoch": 0.9511212649552095, + "flos": 18879999701760.0, + "grad_norm": 1.7645184730297836, + "language_loss": 0.64872926, + "learning_rate": 2.495342765579811e-08, + "loss": 0.6707145, + "num_input_tokens_seen": 170890260, + "step": 7910, + "time_per_iteration": 2.7339305877685547 + }, + { + "auxiliary_loss_clip": 0.01090972, + "auxiliary_loss_mlp": 0.01084369, + "balance_loss_clip": 1.01934147, + "balance_loss_mlp": 1.00430775, + "epoch": 0.9512415078458486, + "flos": 20810521094400.0, + "grad_norm": 1.5631654790373402, + "language_loss": 0.71178687, + "learning_rate": 2.4830910429693984e-08, + "loss": 0.7335403, + "num_input_tokens_seen": 170910220, + "step": 7911, + "time_per_iteration": 2.8456156253814697 + }, + { + "auxiliary_loss_clip": 0.01134424, + "auxiliary_loss_mlp": 0.01084075, + "balance_loss_clip": 1.02535224, + "balance_loss_mlp": 1.00391889, + "epoch": 0.9513617507364877, + "flos": 18369565482240.0, + "grad_norm": 4.9440774511253895, + "language_loss": 0.79535317, + "learning_rate": 2.470869283517052e-08, + "loss": 0.81753814, + "num_input_tokens_seen": 170928255, + "step": 7912, + "time_per_iteration": 2.6908328533172607 + }, + { + "auxiliary_loss_clip": 0.01126295, + "auxiliary_loss_mlp": 0.01084342, + "balance_loss_clip": 1.02501178, + "balance_loss_mlp": 1.00423384, + "epoch": 0.9514819936271268, + "flos": 25010166412800.0, + "grad_norm": 11.363650306528767, + "language_loss": 0.77017611, + "learning_rate": 2.458677489076777e-08, + "loss": 0.79228246, + "num_input_tokens_seen": 170949265, + "step": 7913, + "time_per_iteration": 2.7107350826263428 + }, + { + "auxiliary_loss_clip": 0.01126973, + "auxiliary_loss_mlp": 0.01084631, + "balance_loss_clip": 1.02576315, + "balance_loss_mlp": 1.00461817, + "epoch": 0.9516022365177659, + "flos": 18662129758080.0, + "grad_norm": 1.573865489230659, + "language_loss": 0.83150923, + "learning_rate": 2.446515661498072e-08, + "loss": 0.8536253, + "num_input_tokens_seen": 170968595, + "step": 7914, + "time_per_iteration": 2.671473503112793 + }, + { + "auxiliary_loss_clip": 0.01083095, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.01855803, + "balance_loss_mlp": 1.0031389, + "epoch": 0.9517224794084049, + "flos": 25372109808000.0, + "grad_norm": 2.1245724462459026, + "language_loss": 0.74137533, + "learning_rate": 2.434383802625861e-08, + "loss": 0.76303828, + "num_input_tokens_seen": 170987550, + "step": 7915, + "time_per_iteration": 2.996828556060791 + }, + { + "auxiliary_loss_clip": 0.01108002, + "auxiliary_loss_mlp": 0.01084646, + "balance_loss_clip": 1.02385557, + "balance_loss_mlp": 1.00458503, + "epoch": 0.9518427222990441, + "flos": 21470918595840.0, + "grad_norm": 1.9567833825679823, + "language_loss": 0.73911917, + "learning_rate": 2.4222819143005168e-08, + "loss": 0.76104569, + "num_input_tokens_seen": 171007145, + "step": 7916, + "time_per_iteration": 2.7447001934051514 + }, + { + "auxiliary_loss_clip": 0.01135286, + "auxiliary_loss_mlp": 0.0108393, + "balance_loss_clip": 1.02643561, + "balance_loss_mlp": 1.00386894, + "epoch": 0.9519629651896832, + "flos": 21033634423680.0, + "grad_norm": 1.707422881427667, + "language_loss": 0.81260473, + "learning_rate": 2.4102099983579706e-08, + "loss": 0.83479691, + "num_input_tokens_seen": 171026295, + "step": 7917, + "time_per_iteration": 2.696772575378418 + }, + { + "auxiliary_loss_clip": 0.01127851, + "auxiliary_loss_mlp": 0.01083612, + "balance_loss_clip": 1.02650213, + "balance_loss_mlp": 1.00345588, + "epoch": 0.9520832080803222, + "flos": 21689219502720.0, + "grad_norm": 1.6374895744991886, + "language_loss": 0.77320027, + "learning_rate": 2.3981680566294236e-08, + "loss": 0.79531485, + "num_input_tokens_seen": 171045895, + "step": 7918, + "time_per_iteration": 2.7327511310577393 + }, + { + "auxiliary_loss_clip": 0.01135466, + "auxiliary_loss_mlp": 0.01083734, + "balance_loss_clip": 1.02648783, + "balance_loss_mlp": 1.00372076, + "epoch": 0.9522034509709614, + "flos": 23145289125120.0, + "grad_norm": 2.4008573412144165, + "language_loss": 0.73685879, + "learning_rate": 2.3861560909416822e-08, + "loss": 0.75905085, + "num_input_tokens_seen": 171065445, + "step": 7919, + "time_per_iteration": 2.6334187984466553 + }, + { + "auxiliary_loss_clip": 0.01080864, + "auxiliary_loss_mlp": 0.0108408, + "balance_loss_clip": 1.02364874, + "balance_loss_mlp": 1.00392377, + "epoch": 0.9523236938616004, + "flos": 24679428958080.0, + "grad_norm": 1.686595393670036, + "language_loss": 0.82635355, + "learning_rate": 2.3741741031169325e-08, + "loss": 0.84800303, + "num_input_tokens_seen": 171085015, + "step": 7920, + "time_per_iteration": 2.8027052879333496 + }, + { + "auxiliary_loss_clip": 0.01094829, + "auxiliary_loss_mlp": 0.01083602, + "balance_loss_clip": 1.02070236, + "balance_loss_mlp": 1.00354099, + "epoch": 0.9524439367522395, + "flos": 22672309812480.0, + "grad_norm": 1.6854971724985117, + "language_loss": 0.71440661, + "learning_rate": 2.3622220949728544e-08, + "loss": 0.73619092, + "num_input_tokens_seen": 171103900, + "step": 7921, + "time_per_iteration": 4.630045175552368 + }, + { + "auxiliary_loss_clip": 0.01125674, + "auxiliary_loss_mlp": 0.01083533, + "balance_loss_clip": 1.02454948, + "balance_loss_mlp": 1.00337684, + "epoch": 0.9525641796428787, + "flos": 34055525024640.0, + "grad_norm": 2.316004181485037, + "language_loss": 0.61605537, + "learning_rate": 2.3503000683225526e-08, + "loss": 0.63814747, + "num_input_tokens_seen": 171121615, + "step": 7922, + "time_per_iteration": 2.7677419185638428 + }, + { + "auxiliary_loss_clip": 0.01133634, + "auxiliary_loss_mlp": 0.01085318, + "balance_loss_clip": 1.02453685, + "balance_loss_mlp": 1.005162, + "epoch": 0.9526844225335177, + "flos": 16727083251840.0, + "grad_norm": 1.8455963867939214, + "language_loss": 0.8451364, + "learning_rate": 2.3384080249745585e-08, + "loss": 0.86732596, + "num_input_tokens_seen": 171139505, + "step": 7923, + "time_per_iteration": 2.582064390182495 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.01084519, + "balance_loss_clip": 1.02417827, + "balance_loss_mlp": 1.00450587, + "epoch": 0.9528046654241568, + "flos": 36939367330560.0, + "grad_norm": 2.1503046120506237, + "language_loss": 0.82667851, + "learning_rate": 2.3265459667329178e-08, + "loss": 0.84852171, + "num_input_tokens_seen": 171158995, + "step": 7924, + "time_per_iteration": 3.8099303245544434 + }, + { + "auxiliary_loss_clip": 0.01114726, + "auxiliary_loss_mlp": 0.01083797, + "balance_loss_clip": 1.02386796, + "balance_loss_mlp": 1.0037359, + "epoch": 0.9529249083147959, + "flos": 18255010032000.0, + "grad_norm": 2.956534566629789, + "language_loss": 0.86168146, + "learning_rate": 2.31471389539708e-08, + "loss": 0.88366663, + "num_input_tokens_seen": 171176120, + "step": 7925, + "time_per_iteration": 3.622443675994873 + }, + { + "auxiliary_loss_clip": 0.01125699, + "auxiliary_loss_mlp": 0.00872751, + "balance_loss_clip": 1.02556539, + "balance_loss_mlp": 1.00012481, + "epoch": 0.953045151205435, + "flos": 28658438985600.0, + "grad_norm": 6.650989834430681, + "language_loss": 0.72816801, + "learning_rate": 2.3029118127619872e-08, + "loss": 0.74815249, + "num_input_tokens_seen": 171195835, + "step": 7926, + "time_per_iteration": 2.7323625087738037 + }, + { + "auxiliary_loss_clip": 0.01116115, + "auxiliary_loss_mlp": 0.0108415, + "balance_loss_clip": 1.02441895, + "balance_loss_mlp": 1.00399339, + "epoch": 0.953165394096074, + "flos": 21835232288640.0, + "grad_norm": 2.048908257629954, + "language_loss": 0.86796558, + "learning_rate": 2.2911397206179628e-08, + "loss": 0.88996816, + "num_input_tokens_seen": 171212585, + "step": 7927, + "time_per_iteration": 2.7199132442474365 + }, + { + "auxiliary_loss_clip": 0.01135388, + "auxiliary_loss_mlp": 0.01084498, + "balance_loss_clip": 1.02652776, + "balance_loss_mlp": 1.00443673, + "epoch": 0.9532856369867132, + "flos": 19975059682560.0, + "grad_norm": 2.725104504315842, + "language_loss": 0.62803888, + "learning_rate": 2.279397620750845e-08, + "loss": 0.6502378, + "num_input_tokens_seen": 171231630, + "step": 7928, + "time_per_iteration": 2.6278257369995117 + }, + { + "auxiliary_loss_clip": 0.01115705, + "auxiliary_loss_mlp": 0.01083278, + "balance_loss_clip": 1.0247972, + "balance_loss_mlp": 1.00321651, + "epoch": 0.9534058798773523, + "flos": 15049588239360.0, + "grad_norm": 1.878103853553167, + "language_loss": 0.78701001, + "learning_rate": 2.2676855149419195e-08, + "loss": 0.80899984, + "num_input_tokens_seen": 171248800, + "step": 7929, + "time_per_iteration": 2.7640604972839355 + }, + { + "auxiliary_loss_clip": 0.01110808, + "auxiliary_loss_mlp": 0.01083597, + "balance_loss_clip": 1.02193999, + "balance_loss_mlp": 1.0034883, + "epoch": 0.9535261227679913, + "flos": 17602800831360.0, + "grad_norm": 2.1116966329711007, + "language_loss": 0.75496656, + "learning_rate": 2.2560034049678988e-08, + "loss": 0.7769106, + "num_input_tokens_seen": 171263150, + "step": 7930, + "time_per_iteration": 2.676701307296753 + }, + { + "auxiliary_loss_clip": 0.01134183, + "auxiliary_loss_mlp": 0.01084626, + "balance_loss_clip": 1.02486229, + "balance_loss_mlp": 1.00442195, + "epoch": 0.9536463656586305, + "flos": 23142954741120.0, + "grad_norm": 1.7460410943666582, + "language_loss": 0.75377786, + "learning_rate": 2.2443512926008988e-08, + "loss": 0.77596593, + "num_input_tokens_seen": 171282480, + "step": 7931, + "time_per_iteration": 2.642807960510254 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01085332, + "balance_loss_clip": 1.02311659, + "balance_loss_mlp": 1.00522351, + "epoch": 0.9537666085492695, + "flos": 18625033987200.0, + "grad_norm": 5.262794177258839, + "language_loss": 0.6978662, + "learning_rate": 2.2327291796085946e-08, + "loss": 0.71973878, + "num_input_tokens_seen": 171300840, + "step": 7932, + "time_per_iteration": 2.73934268951416 + }, + { + "auxiliary_loss_clip": 0.01134488, + "auxiliary_loss_mlp": 0.01084573, + "balance_loss_clip": 1.02541733, + "balance_loss_mlp": 1.0043695, + "epoch": 0.9538868514399086, + "flos": 18989347680000.0, + "grad_norm": 2.5587321466748354, + "language_loss": 0.77085197, + "learning_rate": 2.2211370677540197e-08, + "loss": 0.79304254, + "num_input_tokens_seen": 171317365, + "step": 7933, + "time_per_iteration": 2.6422359943389893 + }, + { + "auxiliary_loss_clip": 0.01134423, + "auxiliary_loss_mlp": 0.01083954, + "balance_loss_clip": 1.02489448, + "balance_loss_mlp": 1.00389326, + "epoch": 0.9540070943305478, + "flos": 16800556521600.0, + "grad_norm": 2.4290618931213275, + "language_loss": 0.78224438, + "learning_rate": 2.2095749587957012e-08, + "loss": 0.80442816, + "num_input_tokens_seen": 171335270, + "step": 7934, + "time_per_iteration": 2.637543201446533 + }, + { + "auxiliary_loss_clip": 0.01119834, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.02664733, + "balance_loss_mlp": 1.00326324, + "epoch": 0.9541273372211868, + "flos": 20156911263360.0, + "grad_norm": 1.775576427247813, + "language_loss": 0.69587398, + "learning_rate": 2.1980428544876138e-08, + "loss": 0.71790606, + "num_input_tokens_seen": 171353910, + "step": 7935, + "time_per_iteration": 2.7126269340515137 + }, + { + "auxiliary_loss_clip": 0.01101904, + "auxiliary_loss_mlp": 0.01083765, + "balance_loss_clip": 1.02422619, + "balance_loss_mlp": 1.00370383, + "epoch": 0.9542475801118259, + "flos": 26725511381760.0, + "grad_norm": 1.4536811519651145, + "language_loss": 0.7397871, + "learning_rate": 2.1865407565791584e-08, + "loss": 0.76164383, + "num_input_tokens_seen": 171375480, + "step": 7936, + "time_per_iteration": 2.8574843406677246 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_clip": 1.02389383, + "balance_loss_mlp": 1.00423658, + "epoch": 0.954367823002465, + "flos": 23330911633920.0, + "grad_norm": 1.9301151915349652, + "language_loss": 0.77019286, + "learning_rate": 2.175068666815183e-08, + "loss": 0.79219532, + "num_input_tokens_seen": 171396320, + "step": 7937, + "time_per_iteration": 2.714670419692993 + }, + { + "auxiliary_loss_clip": 0.01109796, + "auxiliary_loss_mlp": 0.01083973, + "balance_loss_clip": 1.0253619, + "balance_loss_mlp": 1.00386405, + "epoch": 0.9544880658931041, + "flos": 14902713527040.0, + "grad_norm": 2.014830378250718, + "language_loss": 0.78818703, + "learning_rate": 2.163626586935985e-08, + "loss": 0.81012464, + "num_input_tokens_seen": 171412860, + "step": 7938, + "time_per_iteration": 2.7876622676849365 + }, + { + "auxiliary_loss_clip": 0.01125149, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_clip": 1.02365756, + "balance_loss_mlp": 1.00484228, + "epoch": 0.9546083087837431, + "flos": 29095902725760.0, + "grad_norm": 1.7426424863448355, + "language_loss": 0.62968057, + "learning_rate": 2.1522145186773755e-08, + "loss": 0.65178204, + "num_input_tokens_seen": 171431780, + "step": 7939, + "time_per_iteration": 2.719956159591675 + }, + { + "auxiliary_loss_clip": 0.01115841, + "auxiliary_loss_mlp": 0.01084805, + "balance_loss_clip": 1.02446187, + "balance_loss_mlp": 1.0047915, + "epoch": 0.9547285516743822, + "flos": 21142335957120.0, + "grad_norm": 1.6082099161567602, + "language_loss": 0.85609174, + "learning_rate": 2.140832463770481e-08, + "loss": 0.87809819, + "num_input_tokens_seen": 171450975, + "step": 7940, + "time_per_iteration": 2.775719404220581 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01084443, + "balance_loss_clip": 1.02410734, + "balance_loss_mlp": 1.00423908, + "epoch": 0.9548487945650214, + "flos": 27490157130240.0, + "grad_norm": 2.6091018630929463, + "language_loss": 0.76295912, + "learning_rate": 2.129480423941987e-08, + "loss": 0.78497744, + "num_input_tokens_seen": 171467645, + "step": 7941, + "time_per_iteration": 2.7672557830810547 + }, + { + "auxiliary_loss_clip": 0.01101677, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_clip": 1.0265069, + "balance_loss_mlp": 1.00352132, + "epoch": 0.9549690374556604, + "flos": 22273198819200.0, + "grad_norm": 1.606177169483802, + "language_loss": 0.801319, + "learning_rate": 2.1181584009140052e-08, + "loss": 0.82317114, + "num_input_tokens_seen": 171487185, + "step": 7942, + "time_per_iteration": 2.754340171813965 + }, + { + "auxiliary_loss_clip": 0.01108697, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_clip": 1.02398276, + "balance_loss_mlp": 1.00380445, + "epoch": 0.9550892803462995, + "flos": 17595294888960.0, + "grad_norm": 2.4139783865069813, + "language_loss": 0.83771074, + "learning_rate": 2.10686639640405e-08, + "loss": 0.85963595, + "num_input_tokens_seen": 171501275, + "step": 7943, + "time_per_iteration": 2.699758529663086 + }, + { + "auxiliary_loss_clip": 0.0111021, + "auxiliary_loss_mlp": 0.01082872, + "balance_loss_clip": 1.02579331, + "balance_loss_mlp": 1.0029062, + "epoch": 0.9552095232369386, + "flos": 24353144789760.0, + "grad_norm": 1.5639701815805562, + "language_loss": 0.81235892, + "learning_rate": 2.0956044121251294e-08, + "loss": 0.83428973, + "num_input_tokens_seen": 171520060, + "step": 7944, + "time_per_iteration": 2.6812098026275635 + }, + { + "auxiliary_loss_clip": 0.01099158, + "auxiliary_loss_mlp": 0.01084924, + "balance_loss_clip": 1.01956272, + "balance_loss_mlp": 1.0048635, + "epoch": 0.9553297661275777, + "flos": 22746860490240.0, + "grad_norm": 1.7803774019778273, + "language_loss": 0.80709147, + "learning_rate": 2.084372449785654e-08, + "loss": 0.82893229, + "num_input_tokens_seen": 171539895, + "step": 7945, + "time_per_iteration": 2.839388608932495 + }, + { + "auxiliary_loss_clip": 0.01118234, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_clip": 1.02541244, + "balance_loss_mlp": 1.00371838, + "epoch": 0.9554500090182168, + "flos": 15413866018560.0, + "grad_norm": 2.103077464390286, + "language_loss": 0.6873548, + "learning_rate": 2.0731705110895282e-08, + "loss": 0.70937496, + "num_input_tokens_seen": 171557385, + "step": 7946, + "time_per_iteration": 3.684752941131592 + }, + { + "auxiliary_loss_clip": 0.01125613, + "auxiliary_loss_mlp": 0.01085181, + "balance_loss_clip": 1.02528608, + "balance_loss_mlp": 1.00488138, + "epoch": 0.9555702519088559, + "flos": 23513517400320.0, + "grad_norm": 1.6293424696884302, + "language_loss": 0.86995798, + "learning_rate": 2.0619985977360587e-08, + "loss": 0.89206588, + "num_input_tokens_seen": 171575705, + "step": 7947, + "time_per_iteration": 3.63539457321167 + }, + { + "auxiliary_loss_clip": 0.01093716, + "auxiliary_loss_mlp": 0.01083538, + "balance_loss_clip": 1.02333426, + "balance_loss_mlp": 1.00357246, + "epoch": 0.955690494799495, + "flos": 22962072827520.0, + "grad_norm": 1.6038623822158804, + "language_loss": 0.76953804, + "learning_rate": 2.0508567114200237e-08, + "loss": 0.79131067, + "num_input_tokens_seen": 171595620, + "step": 7948, + "time_per_iteration": 2.8146212100982666 + }, + { + "auxiliary_loss_clip": 0.01117548, + "auxiliary_loss_mlp": 0.01083978, + "balance_loss_clip": 1.02482152, + "balance_loss_mlp": 1.00401258, + "epoch": 0.955810737690134, + "flos": 26031250333440.0, + "grad_norm": 1.9447835806934577, + "language_loss": 0.78737968, + "learning_rate": 2.0397448538316485e-08, + "loss": 0.80939496, + "num_input_tokens_seen": 171616660, + "step": 7949, + "time_per_iteration": 2.6911909580230713 + }, + { + "auxiliary_loss_clip": 0.01108538, + "auxiliary_loss_mlp": 0.01083045, + "balance_loss_clip": 1.02469289, + "balance_loss_mlp": 1.00312662, + "epoch": 0.9559309805807732, + "flos": 20849951249280.0, + "grad_norm": 1.890956724746981, + "language_loss": 0.66462481, + "learning_rate": 2.028663026656563e-08, + "loss": 0.6865406, + "num_input_tokens_seen": 171635515, + "step": 7950, + "time_per_iteration": 4.530627965927124 + }, + { + "auxiliary_loss_clip": 0.01134846, + "auxiliary_loss_mlp": 0.00872842, + "balance_loss_clip": 1.02610588, + "balance_loss_mlp": 1.00010061, + "epoch": 0.9560512234714122, + "flos": 21578219498880.0, + "grad_norm": 1.8422375297511093, + "language_loss": 0.71859413, + "learning_rate": 2.0176112315758885e-08, + "loss": 0.73867106, + "num_input_tokens_seen": 171653305, + "step": 7951, + "time_per_iteration": 2.644468069076538 + }, + { + "auxiliary_loss_clip": 0.01097323, + "auxiliary_loss_mlp": 0.01083717, + "balance_loss_clip": 1.02319014, + "balance_loss_mlp": 1.0036087, + "epoch": 0.9561714663620513, + "flos": 17450144029440.0, + "grad_norm": 2.1868827957575108, + "language_loss": 0.69265366, + "learning_rate": 2.0065894702661957e-08, + "loss": 0.71446401, + "num_input_tokens_seen": 171669980, + "step": 7952, + "time_per_iteration": 2.7386696338653564 + }, + { + "auxiliary_loss_clip": 0.01107591, + "auxiliary_loss_mlp": 0.00872902, + "balance_loss_clip": 1.02351213, + "balance_loss_mlp": 1.00006807, + "epoch": 0.9562917092526905, + "flos": 26098510550400.0, + "grad_norm": 1.5345428893414188, + "language_loss": 0.77958423, + "learning_rate": 1.9955977443994577e-08, + "loss": 0.79938918, + "num_input_tokens_seen": 171689970, + "step": 7953, + "time_per_iteration": 2.8115289211273193 + }, + { + "auxiliary_loss_clip": 0.0111801, + "auxiliary_loss_mlp": 0.01083732, + "balance_loss_clip": 1.02516627, + "balance_loss_mlp": 1.00343251, + "epoch": 0.9564119521433295, + "flos": 24096742531200.0, + "grad_norm": 2.1653691823830052, + "language_loss": 0.62149155, + "learning_rate": 1.9846360556430965e-08, + "loss": 0.64350897, + "num_input_tokens_seen": 171708270, + "step": 7954, + "time_per_iteration": 2.70133638381958 + }, + { + "auxiliary_loss_clip": 0.0113421, + "auxiliary_loss_mlp": 0.01083654, + "balance_loss_clip": 1.02541935, + "balance_loss_mlp": 1.00349808, + "epoch": 0.9565321950339686, + "flos": 32008903896960.0, + "grad_norm": 3.5945448564012357, + "language_loss": 0.61202419, + "learning_rate": 1.973704405660004e-08, + "loss": 0.63420278, + "num_input_tokens_seen": 171729385, + "step": 7955, + "time_per_iteration": 2.7663466930389404 + }, + { + "auxiliary_loss_clip": 0.01078498, + "auxiliary_loss_mlp": 0.01083698, + "balance_loss_clip": 1.01996446, + "balance_loss_mlp": 1.00363755, + "epoch": 0.9566524379246077, + "flos": 23588642695680.0, + "grad_norm": 2.567429741324164, + "language_loss": 0.78141773, + "learning_rate": 1.9628027961085203e-08, + "loss": 0.80303973, + "num_input_tokens_seen": 171752615, + "step": 7956, + "time_per_iteration": 2.8878746032714844 + }, + { + "auxiliary_loss_clip": 0.01107199, + "auxiliary_loss_mlp": 0.01083842, + "balance_loss_clip": 1.02310371, + "balance_loss_mlp": 1.0037334, + "epoch": 0.9567726808152468, + "flos": 38067716240640.0, + "grad_norm": 2.0233550320413776, + "language_loss": 0.84033465, + "learning_rate": 1.9519312286423894e-08, + "loss": 0.86224502, + "num_input_tokens_seen": 171775810, + "step": 7957, + "time_per_iteration": 2.8631134033203125 + }, + { + "auxiliary_loss_clip": 0.01125263, + "auxiliary_loss_mlp": 0.01084611, + "balance_loss_clip": 1.02579999, + "balance_loss_mlp": 1.00450265, + "epoch": 0.9568929237058859, + "flos": 22744059229440.0, + "grad_norm": 1.5920494126327218, + "language_loss": 0.77823472, + "learning_rate": 1.9410897049108255e-08, + "loss": 0.8003335, + "num_input_tokens_seen": 171795090, + "step": 7958, + "time_per_iteration": 2.836228609085083 + }, + { + "auxiliary_loss_clip": 0.01137467, + "auxiliary_loss_mlp": 0.01083877, + "balance_loss_clip": 1.02732205, + "balance_loss_mlp": 1.00376821, + "epoch": 0.957013166596525, + "flos": 23841633162240.0, + "grad_norm": 1.7447643554136398, + "language_loss": 0.91218281, + "learning_rate": 1.9302782265584905e-08, + "loss": 0.93439615, + "num_input_tokens_seen": 171815755, + "step": 7959, + "time_per_iteration": 2.658449172973633 + }, + { + "auxiliary_loss_clip": 0.01091027, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_clip": 1.01950026, + "balance_loss_mlp": 1.00475311, + "epoch": 0.9571334094871641, + "flos": 17639286071040.0, + "grad_norm": 2.1829666554110134, + "language_loss": 0.86757815, + "learning_rate": 1.9194967952254282e-08, + "loss": 0.88933611, + "num_input_tokens_seen": 171834330, + "step": 7960, + "time_per_iteration": 2.845075845718384 + }, + { + "auxiliary_loss_clip": 0.01124571, + "auxiliary_loss_mlp": 0.01084432, + "balance_loss_clip": 1.02560997, + "balance_loss_mlp": 1.00437164, + "epoch": 0.9572536523778031, + "flos": 15369623441280.0, + "grad_norm": 2.1378372677679187, + "language_loss": 0.80385411, + "learning_rate": 1.9087454125472635e-08, + "loss": 0.82594419, + "num_input_tokens_seen": 171848805, + "step": 7961, + "time_per_iteration": 2.681011915206909 + }, + { + "auxiliary_loss_clip": 0.01134956, + "auxiliary_loss_mlp": 0.01083641, + "balance_loss_clip": 1.02593255, + "balance_loss_mlp": 1.00357997, + "epoch": 0.9573738952684423, + "flos": 24969838417920.0, + "grad_norm": 1.7858699627347565, + "language_loss": 0.7821607, + "learning_rate": 1.8980240801548696e-08, + "loss": 0.80434668, + "num_input_tokens_seen": 171867995, + "step": 7962, + "time_per_iteration": 2.700101852416992 + }, + { + "auxiliary_loss_clip": 0.01115672, + "auxiliary_loss_mlp": 0.01083613, + "balance_loss_clip": 1.02502799, + "balance_loss_mlp": 1.00355184, + "epoch": 0.9574941381590814, + "flos": 25769461034880.0, + "grad_norm": 1.7528549588440128, + "language_loss": 0.74112976, + "learning_rate": 1.8873327996747458e-08, + "loss": 0.76312256, + "num_input_tokens_seen": 171886495, + "step": 7963, + "time_per_iteration": 2.747384786605835 + }, + { + "auxiliary_loss_clip": 0.01126331, + "auxiliary_loss_mlp": 0.01083196, + "balance_loss_clip": 1.02493262, + "balance_loss_mlp": 1.00313485, + "epoch": 0.9576143810497204, + "flos": 32307178435200.0, + "grad_norm": 1.8250876968639889, + "language_loss": 0.65891099, + "learning_rate": 1.8766715727287053e-08, + "loss": 0.68100625, + "num_input_tokens_seen": 171908200, + "step": 7964, + "time_per_iteration": 2.756854772567749 + }, + { + "auxiliary_loss_clip": 0.01126097, + "auxiliary_loss_mlp": 0.00872949, + "balance_loss_clip": 1.02482677, + "balance_loss_mlp": 1.00010514, + "epoch": 0.9577346239403596, + "flos": 27745733376000.0, + "grad_norm": 1.9488735766006, + "language_loss": 0.79461706, + "learning_rate": 1.8660404009340546e-08, + "loss": 0.8146075, + "num_input_tokens_seen": 171928650, + "step": 7965, + "time_per_iteration": 2.677962303161621 + }, + { + "auxiliary_loss_clip": 0.01104796, + "auxiliary_loss_mlp": 0.01079022, + "balance_loss_clip": 1.01742804, + "balance_loss_mlp": 1.00005817, + "epoch": 0.9578548668309986, + "flos": 57468313710720.0, + "grad_norm": 0.881885701428142, + "language_loss": 0.59504384, + "learning_rate": 1.8554392859035485e-08, + "loss": 0.61688203, + "num_input_tokens_seen": 171986400, + "step": 7966, + "time_per_iteration": 3.2371551990509033 + }, + { + "auxiliary_loss_clip": 0.01070895, + "auxiliary_loss_mlp": 0.01083678, + "balance_loss_clip": 1.0198462, + "balance_loss_mlp": 1.00352192, + "epoch": 0.9579751097216377, + "flos": 19756040503680.0, + "grad_norm": 1.6322483931122405, + "language_loss": 0.78933787, + "learning_rate": 1.8448682292453444e-08, + "loss": 0.81088358, + "num_input_tokens_seen": 172005475, + "step": 7967, + "time_per_iteration": 2.9543869495391846 + }, + { + "auxiliary_loss_clip": 0.01137133, + "auxiliary_loss_mlp": 0.01084378, + "balance_loss_clip": 1.02772903, + "balance_loss_mlp": 1.00426972, + "epoch": 0.9580953526122769, + "flos": 18041270152320.0, + "grad_norm": 1.7895767703795247, + "language_loss": 0.66302729, + "learning_rate": 1.8343272325631154e-08, + "loss": 0.68524241, + "num_input_tokens_seen": 172024420, + "step": 7968, + "time_per_iteration": 2.6426870822906494 + }, + { + "auxiliary_loss_clip": 0.01072825, + "auxiliary_loss_mlp": 0.00872896, + "balance_loss_clip": 1.01808238, + "balance_loss_mlp": 1.00009799, + "epoch": 0.9582155955029159, + "flos": 24270154416000.0, + "grad_norm": 2.1078885118415474, + "language_loss": 0.78163719, + "learning_rate": 1.8238162974558492e-08, + "loss": 0.80109447, + "num_input_tokens_seen": 172038350, + "step": 7969, + "time_per_iteration": 3.058736801147461 + }, + { + "auxiliary_loss_clip": 0.01115098, + "auxiliary_loss_mlp": 0.01085512, + "balance_loss_clip": 1.0238266, + "balance_loss_mlp": 1.00545132, + "epoch": 0.958335838393555, + "flos": 22783309816320.0, + "grad_norm": 1.8658378786268934, + "language_loss": 0.74492431, + "learning_rate": 1.8133354255181144e-08, + "loss": 0.76693046, + "num_input_tokens_seen": 172058665, + "step": 7970, + "time_per_iteration": 2.768169403076172 + }, + { + "auxiliary_loss_clip": 0.0112625, + "auxiliary_loss_mlp": 0.01084145, + "balance_loss_clip": 1.02476907, + "balance_loss_mlp": 1.00408435, + "epoch": 0.958456081284194, + "flos": 16911484698240.0, + "grad_norm": 1.7698085539692203, + "language_loss": 0.74402046, + "learning_rate": 1.802884618339795e-08, + "loss": 0.76612437, + "num_input_tokens_seen": 172077470, + "step": 7971, + "time_per_iteration": 3.694786787033081 + }, + { + "auxiliary_loss_clip": 0.01124995, + "auxiliary_loss_mlp": 0.01084801, + "balance_loss_clip": 1.0250349, + "balance_loss_mlp": 1.00469196, + "epoch": 0.9585763241748332, + "flos": 19974951941760.0, + "grad_norm": 1.9313507201694076, + "language_loss": 0.81243515, + "learning_rate": 1.7924638775062894e-08, + "loss": 0.8345331, + "num_input_tokens_seen": 172096590, + "step": 7972, + "time_per_iteration": 3.5522332191467285 + }, + { + "auxiliary_loss_clip": 0.01103503, + "auxiliary_loss_mlp": 0.01083848, + "balance_loss_clip": 1.02230144, + "balance_loss_mlp": 1.0038352, + "epoch": 0.9586965670654722, + "flos": 21395649646080.0, + "grad_norm": 1.9961827161983687, + "language_loss": 0.81538808, + "learning_rate": 1.7820732045984444e-08, + "loss": 0.83726156, + "num_input_tokens_seen": 172116735, + "step": 7973, + "time_per_iteration": 2.8220767974853516 + }, + { + "auxiliary_loss_clip": 0.01127444, + "auxiliary_loss_mlp": 0.01083903, + "balance_loss_clip": 1.0260011, + "balance_loss_mlp": 1.00374675, + "epoch": 0.9588168099561113, + "flos": 21435115714560.0, + "grad_norm": 1.9168167239801583, + "language_loss": 0.73806667, + "learning_rate": 1.7717126011924655e-08, + "loss": 0.76018018, + "num_input_tokens_seen": 172138320, + "step": 7974, + "time_per_iteration": 2.8058457374572754 + }, + { + "auxiliary_loss_clip": 0.01099936, + "auxiliary_loss_mlp": 0.01085412, + "balance_loss_clip": 1.0235486, + "balance_loss_mlp": 1.00535107, + "epoch": 0.9589370528467505, + "flos": 11763761852160.0, + "grad_norm": 2.2960050257855547, + "language_loss": 0.76150542, + "learning_rate": 1.7613820688600957e-08, + "loss": 0.78335893, + "num_input_tokens_seen": 172154225, + "step": 7975, + "time_per_iteration": 3.683960437774658 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.02572608, + "balance_loss_mlp": 1.00389075, + "epoch": 0.9590572957373895, + "flos": 23441516588160.0, + "grad_norm": 1.6877968418575506, + "language_loss": 0.78273851, + "learning_rate": 1.7510816091684588e-08, + "loss": 0.80460191, + "num_input_tokens_seen": 172174150, + "step": 7976, + "time_per_iteration": 3.6495344638824463 + }, + { + "auxiliary_loss_clip": 0.01114692, + "auxiliary_loss_mlp": 0.01083699, + "balance_loss_clip": 1.02427006, + "balance_loss_mlp": 1.0034951, + "epoch": 0.9591775386280286, + "flos": 22528272274560.0, + "grad_norm": 4.821915701375727, + "language_loss": 0.78837276, + "learning_rate": 1.740811223680083e-08, + "loss": 0.81035674, + "num_input_tokens_seen": 172191005, + "step": 7977, + "time_per_iteration": 2.7160773277282715 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01083824, + "balance_loss_clip": 1.02479386, + "balance_loss_mlp": 1.00376356, + "epoch": 0.9592977815186677, + "flos": 18186959715840.0, + "grad_norm": 2.2637189526893002, + "language_loss": 0.74195766, + "learning_rate": 1.7305709139530334e-08, + "loss": 0.76413161, + "num_input_tokens_seen": 172209785, + "step": 7978, + "time_per_iteration": 2.576678514480591 + }, + { + "auxiliary_loss_clip": 0.01126118, + "auxiliary_loss_mlp": 0.01085431, + "balance_loss_clip": 1.02446604, + "balance_loss_mlp": 1.00532269, + "epoch": 0.9594180244093068, + "flos": 16537797555840.0, + "grad_norm": 2.6148270595967715, + "language_loss": 0.74704146, + "learning_rate": 1.7203606815407334e-08, + "loss": 0.76915699, + "num_input_tokens_seen": 172224380, + "step": 7979, + "time_per_iteration": 2.682800531387329 + }, + { + "auxiliary_loss_clip": 0.01117466, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_clip": 1.02545249, + "balance_loss_mlp": 1.00619841, + "epoch": 0.9595382672999458, + "flos": 20554334317440.0, + "grad_norm": 1.719416908144737, + "language_loss": 0.79503721, + "learning_rate": 1.7101805279920557e-08, + "loss": 0.81707543, + "num_input_tokens_seen": 172242540, + "step": 7980, + "time_per_iteration": 2.699913501739502 + }, + { + "auxiliary_loss_clip": 0.01135597, + "auxiliary_loss_mlp": 0.01083592, + "balance_loss_clip": 1.02660537, + "balance_loss_mlp": 1.0034833, + "epoch": 0.959658510190585, + "flos": 22638266697600.0, + "grad_norm": 2.1444834860197517, + "language_loss": 0.81173629, + "learning_rate": 1.7000304548513643e-08, + "loss": 0.83392817, + "num_input_tokens_seen": 172262645, + "step": 7981, + "time_per_iteration": 2.6238858699798584 + }, + { + "auxiliary_loss_clip": 0.01105072, + "auxiliary_loss_mlp": 0.01085263, + "balance_loss_clip": 1.02223563, + "balance_loss_mlp": 1.00510645, + "epoch": 0.9597787530812241, + "flos": 19135252725120.0, + "grad_norm": 2.0503749409925343, + "language_loss": 0.83002353, + "learning_rate": 1.6899104636583394e-08, + "loss": 0.85192686, + "num_input_tokens_seen": 172280695, + "step": 7982, + "time_per_iteration": 2.7160239219665527 + }, + { + "auxiliary_loss_clip": 0.01104715, + "auxiliary_loss_mlp": 0.01078734, + "balance_loss_clip": 1.01739478, + "balance_loss_mlp": 0.99976981, + "epoch": 0.9598989959718631, + "flos": 60098124055680.0, + "grad_norm": 0.7209324941468408, + "language_loss": 0.61952353, + "learning_rate": 1.6798205559482638e-08, + "loss": 0.64135802, + "num_input_tokens_seen": 172343075, + "step": 7983, + "time_per_iteration": 3.467604637145996 + }, + { + "auxiliary_loss_clip": 0.01102607, + "auxiliary_loss_mlp": 0.01083737, + "balance_loss_clip": 1.02067912, + "balance_loss_mlp": 1.00362873, + "epoch": 0.9600192388625023, + "flos": 20886795624960.0, + "grad_norm": 1.7951161213031546, + "language_loss": 0.76688337, + "learning_rate": 1.669760733251713e-08, + "loss": 0.78874677, + "num_input_tokens_seen": 172361950, + "step": 7984, + "time_per_iteration": 2.8477282524108887 + }, + { + "auxiliary_loss_clip": 0.0107091, + "auxiliary_loss_mlp": 0.01083395, + "balance_loss_clip": 1.02201557, + "balance_loss_mlp": 1.00347674, + "epoch": 0.9601394817531413, + "flos": 20445740524800.0, + "grad_norm": 2.055209829456145, + "language_loss": 0.82083213, + "learning_rate": 1.659730997094755e-08, + "loss": 0.84237522, + "num_input_tokens_seen": 172380440, + "step": 7985, + "time_per_iteration": 2.9180757999420166 + }, + { + "auxiliary_loss_clip": 0.01125454, + "auxiliary_loss_mlp": 0.01083607, + "balance_loss_clip": 1.0248239, + "balance_loss_mlp": 1.00349844, + "epoch": 0.9602597246437804, + "flos": 21507152440320.0, + "grad_norm": 1.9694809224009033, + "language_loss": 0.62078756, + "learning_rate": 1.6497313489989283e-08, + "loss": 0.64287817, + "num_input_tokens_seen": 172400265, + "step": 7986, + "time_per_iteration": 2.7404348850250244 + }, + { + "auxiliary_loss_clip": 0.01100553, + "auxiliary_loss_mlp": 0.01083633, + "balance_loss_clip": 1.02297866, + "balance_loss_mlp": 1.00352478, + "epoch": 0.9603799675344196, + "flos": 29935099152000.0, + "grad_norm": 2.1414586988139463, + "language_loss": 0.69946659, + "learning_rate": 1.639761790481131e-08, + "loss": 0.72130847, + "num_input_tokens_seen": 172421145, + "step": 7987, + "time_per_iteration": 2.9118740558624268 + }, + { + "auxiliary_loss_clip": 0.0111067, + "auxiliary_loss_mlp": 0.01084123, + "balance_loss_clip": 1.02649236, + "balance_loss_mlp": 1.00401437, + "epoch": 0.9605002104250586, + "flos": 28001525103360.0, + "grad_norm": 1.9441167462941538, + "language_loss": 0.79029346, + "learning_rate": 1.6298223230537754e-08, + "loss": 0.81224144, + "num_input_tokens_seen": 172438945, + "step": 7988, + "time_per_iteration": 2.7277660369873047 + }, + { + "auxiliary_loss_clip": 0.01108065, + "auxiliary_loss_mlp": 0.00872947, + "balance_loss_clip": 1.02241468, + "balance_loss_mlp": 1.00006962, + "epoch": 0.9606204533156977, + "flos": 35590490870400.0, + "grad_norm": 2.8386588031567848, + "language_loss": 0.694884, + "learning_rate": 1.619912948224611e-08, + "loss": 0.71469414, + "num_input_tokens_seen": 172460150, + "step": 7989, + "time_per_iteration": 2.842398166656494 + }, + { + "auxiliary_loss_clip": 0.01107975, + "auxiliary_loss_mlp": 0.01083581, + "balance_loss_clip": 1.02406383, + "balance_loss_mlp": 1.00347209, + "epoch": 0.9607406962063368, + "flos": 26574614346240.0, + "grad_norm": 2.2088330905201476, + "language_loss": 0.60779977, + "learning_rate": 1.6100336674969682e-08, + "loss": 0.62971526, + "num_input_tokens_seen": 172478990, + "step": 7990, + "time_per_iteration": 2.836453437805176 + }, + { + "auxiliary_loss_clip": 0.01083054, + "auxiliary_loss_mlp": 0.01085645, + "balance_loss_clip": 1.02422261, + "balance_loss_mlp": 1.00544095, + "epoch": 0.9608609390969759, + "flos": 25331781813120.0, + "grad_norm": 1.81417368974954, + "language_loss": 0.76562697, + "learning_rate": 1.600184482369449e-08, + "loss": 0.78731394, + "num_input_tokens_seen": 172498905, + "step": 7991, + "time_per_iteration": 2.852581024169922 + }, + { + "auxiliary_loss_clip": 0.01106883, + "auxiliary_loss_mlp": 0.01083974, + "balance_loss_clip": 1.02272916, + "balance_loss_mlp": 1.00386584, + "epoch": 0.960981181987615, + "flos": 21069114082560.0, + "grad_norm": 3.791376065559093, + "language_loss": 0.8897348, + "learning_rate": 1.5903653943362126e-08, + "loss": 0.91164339, + "num_input_tokens_seen": 172517900, + "step": 7992, + "time_per_iteration": 2.7414281368255615 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01083037, + "balance_loss_clip": 1.0238781, + "balance_loss_mlp": 1.00302362, + "epoch": 0.9611014248782541, + "flos": 17823256554240.0, + "grad_norm": 1.7139920536376436, + "language_loss": 0.76819021, + "learning_rate": 1.580576404886802e-08, + "loss": 0.79017293, + "num_input_tokens_seen": 172536430, + "step": 7993, + "time_per_iteration": 2.715489387512207 + }, + { + "auxiliary_loss_clip": 0.01126852, + "auxiliary_loss_mlp": 0.01083581, + "balance_loss_clip": 1.02647841, + "balance_loss_mlp": 1.00347197, + "epoch": 0.9612216677688932, + "flos": 19354631040000.0, + "grad_norm": 2.158396332006721, + "language_loss": 0.79700053, + "learning_rate": 1.570817515506162e-08, + "loss": 0.81910485, + "num_input_tokens_seen": 172555120, + "step": 7994, + "time_per_iteration": 2.6630859375 + }, + { + "auxiliary_loss_clip": 0.01134385, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_clip": 1.02564216, + "balance_loss_mlp": 1.00402594, + "epoch": 0.9613419106595322, + "flos": 15808739207040.0, + "grad_norm": 1.8665395226246841, + "language_loss": 0.81380743, + "learning_rate": 1.561088727674753e-08, + "loss": 0.83599162, + "num_input_tokens_seen": 172569330, + "step": 7995, + "time_per_iteration": 2.607666254043579 + }, + { + "auxiliary_loss_clip": 0.0110017, + "auxiliary_loss_mlp": 0.01084871, + "balance_loss_clip": 1.02373791, + "balance_loss_mlp": 1.00457191, + "epoch": 0.9614621535501714, + "flos": 25702488126720.0, + "grad_norm": 2.1056718764288602, + "language_loss": 0.71177942, + "learning_rate": 1.551390042868417e-08, + "loss": 0.73362982, + "num_input_tokens_seen": 172591100, + "step": 7996, + "time_per_iteration": 2.8075122833251953 + }, + { + "auxiliary_loss_clip": 0.01125962, + "auxiliary_loss_mlp": 0.01083486, + "balance_loss_clip": 1.02533937, + "balance_loss_mlp": 1.00347257, + "epoch": 0.9615823964408104, + "flos": 17819054663040.0, + "grad_norm": 1.7629248167300617, + "language_loss": 0.70533562, + "learning_rate": 1.5417214625584207e-08, + "loss": 0.72743011, + "num_input_tokens_seen": 172608755, + "step": 7997, + "time_per_iteration": 4.4751060009002686 + }, + { + "auxiliary_loss_clip": 0.01126988, + "auxiliary_loss_mlp": 0.010836, + "balance_loss_clip": 1.02574658, + "balance_loss_mlp": 1.00344324, + "epoch": 0.9617026393314495, + "flos": 20190020624640.0, + "grad_norm": 1.7992463828031693, + "language_loss": 0.85054117, + "learning_rate": 1.5320829882114806e-08, + "loss": 0.87264699, + "num_input_tokens_seen": 172626830, + "step": 7998, + "time_per_iteration": 2.6989123821258545 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01083927, + "balance_loss_clip": 1.02482176, + "balance_loss_mlp": 1.00377071, + "epoch": 0.9618228822220887, + "flos": 20267013427200.0, + "grad_norm": 1.8149500855008687, + "language_loss": 0.79080558, + "learning_rate": 1.5224746212897378e-08, + "loss": 0.81298786, + "num_input_tokens_seen": 172646125, + "step": 7999, + "time_per_iteration": 2.6879971027374268 + }, + { + "auxiliary_loss_clip": 0.01134056, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_clip": 1.025967, + "balance_loss_mlp": 1.00366759, + "epoch": 0.9619431251127277, + "flos": 21031300039680.0, + "grad_norm": 1.671580781509389, + "language_loss": 0.77276343, + "learning_rate": 1.512896363250804e-08, + "loss": 0.79494166, + "num_input_tokens_seen": 172666235, + "step": 8000, + "time_per_iteration": 2.7675511837005615 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01084342, + "balance_loss_clip": 1.02426481, + "balance_loss_mlp": 1.00428104, + "epoch": 0.9620633680033668, + "flos": 22382654538240.0, + "grad_norm": 1.799337940034768, + "language_loss": 0.75483215, + "learning_rate": 1.503348215547673e-08, + "loss": 0.77692974, + "num_input_tokens_seen": 172687325, + "step": 8001, + "time_per_iteration": 3.735243558883667 + }, + { + "auxiliary_loss_clip": 0.0111512, + "auxiliary_loss_mlp": 0.01083387, + "balance_loss_clip": 1.02435374, + "balance_loss_mlp": 1.00337338, + "epoch": 0.962183610894006, + "flos": 18471730740480.0, + "grad_norm": 1.913850021211432, + "language_loss": 0.80990982, + "learning_rate": 1.4938301796288078e-08, + "loss": 0.83189499, + "num_input_tokens_seen": 172703895, + "step": 8002, + "time_per_iteration": 3.584242820739746 + }, + { + "auxiliary_loss_clip": 0.01134806, + "auxiliary_loss_mlp": 0.01082929, + "balance_loss_clip": 1.02614594, + "balance_loss_mlp": 1.00277269, + "epoch": 0.962303853784645, + "flos": 18435245500800.0, + "grad_norm": 2.624784050925105, + "language_loss": 0.81778049, + "learning_rate": 1.4843422569380537e-08, + "loss": 0.83995783, + "num_input_tokens_seen": 172720650, + "step": 8003, + "time_per_iteration": 2.6232423782348633 + }, + { + "auxiliary_loss_clip": 0.01074694, + "auxiliary_loss_mlp": 0.01083974, + "balance_loss_clip": 1.02227902, + "balance_loss_mlp": 1.00386572, + "epoch": 0.9624240966752841, + "flos": 26391074826240.0, + "grad_norm": 1.8909918385179738, + "language_loss": 0.82659334, + "learning_rate": 1.4748844489147483e-08, + "loss": 0.84818006, + "num_input_tokens_seen": 172737640, + "step": 8004, + "time_per_iteration": 2.8723371028900146 + }, + { + "auxiliary_loss_clip": 0.01114914, + "auxiliary_loss_mlp": 0.01083018, + "balance_loss_clip": 1.02334011, + "balance_loss_mlp": 1.00295663, + "epoch": 0.9625443395659231, + "flos": 14647675985280.0, + "grad_norm": 1.7554517837505808, + "language_loss": 0.7062642, + "learning_rate": 1.4654567569936326e-08, + "loss": 0.72824353, + "num_input_tokens_seen": 172755215, + "step": 8005, + "time_per_iteration": 2.6998250484466553 + }, + { + "auxiliary_loss_clip": 0.0109744, + "auxiliary_loss_mlp": 0.01086531, + "balance_loss_clip": 1.02365863, + "balance_loss_mlp": 1.0064224, + "epoch": 0.9626645824565623, + "flos": 18367626147840.0, + "grad_norm": 1.7220224880247161, + "language_loss": 0.83117217, + "learning_rate": 1.456059182604874e-08, + "loss": 0.85301185, + "num_input_tokens_seen": 172774020, + "step": 8006, + "time_per_iteration": 2.79076886177063 + }, + { + "auxiliary_loss_clip": 0.01136822, + "auxiliary_loss_mlp": 0.01084815, + "balance_loss_clip": 1.0277698, + "balance_loss_mlp": 1.00461125, + "epoch": 0.9627848253472013, + "flos": 16580424021120.0, + "grad_norm": 1.7296821033237828, + "language_loss": 0.76416272, + "learning_rate": 1.4466917271740653e-08, + "loss": 0.7863791, + "num_input_tokens_seen": 172792220, + "step": 8007, + "time_per_iteration": 2.6201162338256836 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.010837, + "balance_loss_clip": 1.02602577, + "balance_loss_mlp": 1.00349617, + "epoch": 0.9629050682378404, + "flos": 20886867452160.0, + "grad_norm": 1.9865911124022806, + "language_loss": 0.6791653, + "learning_rate": 1.4373543921222697e-08, + "loss": 0.70117879, + "num_input_tokens_seen": 172811805, + "step": 8008, + "time_per_iteration": 2.7471156120300293 + }, + { + "auxiliary_loss_clip": 0.01108488, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_clip": 1.02327704, + "balance_loss_mlp": 1.00389671, + "epoch": 0.9630253111284796, + "flos": 17019252478080.0, + "grad_norm": 1.6391607946659381, + "language_loss": 0.78022879, + "learning_rate": 1.428047178865932e-08, + "loss": 0.80215418, + "num_input_tokens_seen": 172828595, + "step": 8009, + "time_per_iteration": 2.685760259628296 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01083299, + "balance_loss_clip": 1.02334297, + "balance_loss_mlp": 1.00314236, + "epoch": 0.9631455540191186, + "flos": 20338942412160.0, + "grad_norm": 1.793157950099985, + "language_loss": 0.74237597, + "learning_rate": 1.4187700888169451e-08, + "loss": 0.76437539, + "num_input_tokens_seen": 172847770, + "step": 8010, + "time_per_iteration": 2.7462828159332275 + }, + { + "auxiliary_loss_clip": 0.01104299, + "auxiliary_loss_mlp": 0.0107874, + "balance_loss_clip": 1.01755929, + "balance_loss_mlp": 0.99977535, + "epoch": 0.9632657969097577, + "flos": 65956700033280.0, + "grad_norm": 0.7446860793802342, + "language_loss": 0.57026684, + "learning_rate": 1.40952312338265e-08, + "loss": 0.59209722, + "num_input_tokens_seen": 172912415, + "step": 8011, + "time_per_iteration": 3.277940273284912 + }, + { + "auxiliary_loss_clip": 0.01107485, + "auxiliary_loss_mlp": 0.01083517, + "balance_loss_clip": 1.02357554, + "balance_loss_mlp": 1.00340891, + "epoch": 0.9633860398003968, + "flos": 44419523823360.0, + "grad_norm": 1.8303203046726946, + "language_loss": 0.68578547, + "learning_rate": 1.4003062839657909e-08, + "loss": 0.70769548, + "num_input_tokens_seen": 172934895, + "step": 8012, + "time_per_iteration": 2.9605519771575928 + }, + { + "auxiliary_loss_clip": 0.01087672, + "auxiliary_loss_mlp": 0.01083687, + "balance_loss_clip": 1.02185476, + "balance_loss_mlp": 1.00353026, + "epoch": 0.9635062826910359, + "flos": 24827704300800.0, + "grad_norm": 1.5484266760146115, + "language_loss": 0.80098838, + "learning_rate": 1.391119571964583e-08, + "loss": 0.82270199, + "num_input_tokens_seen": 172955835, + "step": 8013, + "time_per_iteration": 2.8381917476654053 + }, + { + "auxiliary_loss_clip": 0.01125247, + "auxiliary_loss_mlp": 0.01084885, + "balance_loss_clip": 1.02472174, + "balance_loss_mlp": 1.0048722, + "epoch": 0.9636265255816749, + "flos": 15961360095360.0, + "grad_norm": 1.6330825821736803, + "language_loss": 0.73042393, + "learning_rate": 1.3819629887726225e-08, + "loss": 0.75252533, + "num_input_tokens_seen": 172973925, + "step": 8014, + "time_per_iteration": 2.670038938522339 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.0108389, + "balance_loss_clip": 1.02440047, + "balance_loss_mlp": 1.00373316, + "epoch": 0.9637467684723141, + "flos": 22601781457920.0, + "grad_norm": 2.2053941007702185, + "language_loss": 0.76322424, + "learning_rate": 1.3728365357789317e-08, + "loss": 0.78523147, + "num_input_tokens_seen": 172993290, + "step": 8015, + "time_per_iteration": 2.772367477416992 + }, + { + "auxiliary_loss_clip": 0.01072282, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.02320874, + "balance_loss_mlp": 1.00441277, + "epoch": 0.9638670113629532, + "flos": 17565812801280.0, + "grad_norm": 2.5913413516565016, + "language_loss": 0.76239145, + "learning_rate": 1.3637402143680254e-08, + "loss": 0.78395993, + "num_input_tokens_seen": 173008190, + "step": 8016, + "time_per_iteration": 2.978569746017456 + }, + { + "auxiliary_loss_clip": 0.01066214, + "auxiliary_loss_mlp": 0.01078762, + "balance_loss_clip": 1.02166033, + "balance_loss_mlp": 0.99979764, + "epoch": 0.9639872542535922, + "flos": 55072139379840.0, + "grad_norm": 0.724720702916453, + "language_loss": 0.55095953, + "learning_rate": 1.3546740259197998e-08, + "loss": 0.57240927, + "num_input_tokens_seen": 173061000, + "step": 8017, + "time_per_iteration": 3.2864456176757812 + }, + { + "auxiliary_loss_clip": 0.01110854, + "auxiliary_loss_mlp": 0.01083964, + "balance_loss_clip": 1.02453816, + "balance_loss_mlp": 1.00385499, + "epoch": 0.9641074971442314, + "flos": 24134484746880.0, + "grad_norm": 2.0069982950209924, + "language_loss": 0.69988787, + "learning_rate": 1.3456379718095989e-08, + "loss": 0.72183603, + "num_input_tokens_seen": 173081415, + "step": 8018, + "time_per_iteration": 2.7750492095947266 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01078987, + "balance_loss_clip": 1.02005374, + "balance_loss_mlp": 1.00002277, + "epoch": 0.9642277400348704, + "flos": 66747416077440.0, + "grad_norm": 0.8483225000111225, + "language_loss": 0.62096018, + "learning_rate": 1.3366320534081487e-08, + "loss": 0.64273727, + "num_input_tokens_seen": 173144095, + "step": 8019, + "time_per_iteration": 3.324643135070801 + }, + { + "auxiliary_loss_clip": 0.01127285, + "auxiliary_loss_mlp": 0.01084075, + "balance_loss_clip": 1.02613735, + "balance_loss_mlp": 1.00391889, + "epoch": 0.9643479829255095, + "flos": 30920272450560.0, + "grad_norm": 1.988307168983116, + "language_loss": 0.75885701, + "learning_rate": 1.3276562720816675e-08, + "loss": 0.78097057, + "num_input_tokens_seen": 173165605, + "step": 8020, + "time_per_iteration": 2.80562424659729 + }, + { + "auxiliary_loss_clip": 0.0113393, + "auxiliary_loss_mlp": 0.01083637, + "balance_loss_clip": 1.02440453, + "balance_loss_mlp": 1.00348067, + "epoch": 0.9644682258161487, + "flos": 20048245643520.0, + "grad_norm": 2.1602370374753805, + "language_loss": 0.82437074, + "learning_rate": 1.3187106291917549e-08, + "loss": 0.84654647, + "num_input_tokens_seen": 173182595, + "step": 8021, + "time_per_iteration": 2.6422946453094482 + }, + { + "auxiliary_loss_clip": 0.01124116, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_clip": 1.02429628, + "balance_loss_mlp": 1.00405359, + "epoch": 0.9645884687067877, + "flos": 21178713456000.0, + "grad_norm": 1.800458622416369, + "language_loss": 0.7063868, + "learning_rate": 1.309795126095503e-08, + "loss": 0.72846913, + "num_input_tokens_seen": 173200895, + "step": 8022, + "time_per_iteration": 3.6440281867980957 + }, + { + "auxiliary_loss_clip": 0.01072682, + "auxiliary_loss_mlp": 0.01085783, + "balance_loss_clip": 1.01748073, + "balance_loss_mlp": 1.00572169, + "epoch": 0.9647087115974268, + "flos": 18945967029120.0, + "grad_norm": 1.9696637658392795, + "language_loss": 0.80602288, + "learning_rate": 1.3009097641453192e-08, + "loss": 0.82760751, + "num_input_tokens_seen": 173218745, + "step": 8023, + "time_per_iteration": 2.949474334716797 + }, + { + "auxiliary_loss_clip": 0.01110024, + "auxiliary_loss_mlp": 0.01083781, + "balance_loss_clip": 1.02031124, + "balance_loss_mlp": 1.00367188, + "epoch": 0.9648289544880659, + "flos": 16545088016640.0, + "grad_norm": 1.637512912191642, + "language_loss": 0.7593444, + "learning_rate": 1.2920545446891474e-08, + "loss": 0.78128242, + "num_input_tokens_seen": 173235465, + "step": 8024, + "time_per_iteration": 2.784830093383789 + }, + { + "auxiliary_loss_clip": 0.01116916, + "auxiliary_loss_mlp": 0.01083498, + "balance_loss_clip": 1.02542126, + "balance_loss_mlp": 1.00338888, + "epoch": 0.964949197378705, + "flos": 24057527857920.0, + "grad_norm": 1.6800247074242092, + "language_loss": 0.70890862, + "learning_rate": 1.2832294690703127e-08, + "loss": 0.7309128, + "num_input_tokens_seen": 173254440, + "step": 8025, + "time_per_iteration": 2.74524188041687 + }, + { + "auxiliary_loss_clip": 0.01125646, + "auxiliary_loss_mlp": 0.01083971, + "balance_loss_clip": 1.02521038, + "balance_loss_mlp": 1.00390959, + "epoch": 0.965069440269344, + "flos": 23365565280000.0, + "grad_norm": 3.243902887161899, + "language_loss": 0.77691823, + "learning_rate": 1.2744345386275668e-08, + "loss": 0.79901433, + "num_input_tokens_seen": 173273980, + "step": 8026, + "time_per_iteration": 3.655268669128418 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01084005, + "balance_loss_clip": 1.02579582, + "balance_loss_mlp": 1.00389624, + "epoch": 0.9651896831599832, + "flos": 25374875155200.0, + "grad_norm": 1.724804698919208, + "language_loss": 0.78505683, + "learning_rate": 1.265669754695109e-08, + "loss": 0.80690759, + "num_input_tokens_seen": 173293550, + "step": 8027, + "time_per_iteration": 3.670095682144165 + }, + { + "auxiliary_loss_clip": 0.01084426, + "auxiliary_loss_mlp": 0.01084348, + "balance_loss_clip": 1.01863587, + "balance_loss_mlp": 1.00414419, + "epoch": 0.9653099260506223, + "flos": 22272875596800.0, + "grad_norm": 1.7779655589367418, + "language_loss": 0.82289505, + "learning_rate": 1.2569351186025201e-08, + "loss": 0.8445828, + "num_input_tokens_seen": 173312005, + "step": 8028, + "time_per_iteration": 2.997544288635254 + }, + { + "auxiliary_loss_clip": 0.01104875, + "auxiliary_loss_mlp": 0.01083206, + "balance_loss_clip": 1.02236342, + "balance_loss_mlp": 1.00324059, + "epoch": 0.9654301689412613, + "flos": 26760847386240.0, + "grad_norm": 1.4065250540512566, + "language_loss": 0.75461853, + "learning_rate": 1.2482306316748737e-08, + "loss": 0.77649933, + "num_input_tokens_seen": 173332450, + "step": 8029, + "time_per_iteration": 2.8081228733062744 + }, + { + "auxiliary_loss_clip": 0.01126378, + "auxiliary_loss_mlp": 0.01084714, + "balance_loss_clip": 1.02449584, + "balance_loss_mlp": 1.00455809, + "epoch": 0.9655504118319005, + "flos": 17412689122560.0, + "grad_norm": 2.063591532640947, + "language_loss": 0.7834996, + "learning_rate": 1.2395562952326021e-08, + "loss": 0.80561054, + "num_input_tokens_seen": 173349610, + "step": 8030, + "time_per_iteration": 2.682874917984009 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01084028, + "balance_loss_clip": 1.0239166, + "balance_loss_mlp": 1.0039196, + "epoch": 0.9656706547225395, + "flos": 22126970551680.0, + "grad_norm": 1.9446202107351072, + "language_loss": 0.8155123, + "learning_rate": 1.2309121105916309e-08, + "loss": 0.8375206, + "num_input_tokens_seen": 173367900, + "step": 8031, + "time_per_iteration": 2.8465540409088135 + }, + { + "auxiliary_loss_clip": 0.01125851, + "auxiliary_loss_mlp": 0.0108464, + "balance_loss_clip": 1.02472758, + "balance_loss_mlp": 1.00453091, + "epoch": 0.9657908976131786, + "flos": 37049289926400.0, + "grad_norm": 1.969306963263684, + "language_loss": 0.69326723, + "learning_rate": 1.222298079063222e-08, + "loss": 0.71537215, + "num_input_tokens_seen": 173389040, + "step": 8032, + "time_per_iteration": 2.7873003482818604 + }, + { + "auxiliary_loss_clip": 0.01124755, + "auxiliary_loss_mlp": 0.0108395, + "balance_loss_clip": 1.02442169, + "balance_loss_mlp": 1.00398457, + "epoch": 0.9659111405038178, + "flos": 24389809597440.0, + "grad_norm": 2.0025281699548314, + "language_loss": 0.7278142, + "learning_rate": 1.2137142019541524e-08, + "loss": 0.74990129, + "num_input_tokens_seen": 173407595, + "step": 8033, + "time_per_iteration": 2.7884340286254883 + }, + { + "auxiliary_loss_clip": 0.0110315, + "auxiliary_loss_mlp": 0.0108522, + "balance_loss_clip": 1.02659047, + "balance_loss_mlp": 1.00511098, + "epoch": 0.9660313833944568, + "flos": 25009412227200.0, + "grad_norm": 2.250085681362772, + "language_loss": 0.73556131, + "learning_rate": 1.2051604805666027e-08, + "loss": 0.75744498, + "num_input_tokens_seen": 173424720, + "step": 8034, + "time_per_iteration": 2.76479434967041 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.00872838, + "balance_loss_clip": 1.02627695, + "balance_loss_mlp": 1.00012255, + "epoch": 0.9661516262850959, + "flos": 11801575895040.0, + "grad_norm": 2.485753991805737, + "language_loss": 0.77928054, + "learning_rate": 1.196636916198135e-08, + "loss": 0.79935932, + "num_input_tokens_seen": 173442260, + "step": 8035, + "time_per_iteration": 2.571460008621216 + }, + { + "auxiliary_loss_clip": 0.0113374, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_clip": 1.02434611, + "balance_loss_mlp": 1.00360537, + "epoch": 0.9662718691757349, + "flos": 20047778766720.0, + "grad_norm": 1.8372047056466836, + "language_loss": 0.76771086, + "learning_rate": 1.1881435101418036e-08, + "loss": 0.7898854, + "num_input_tokens_seen": 173461675, + "step": 8036, + "time_per_iteration": 2.6669046878814697 + }, + { + "auxiliary_loss_clip": 0.01094901, + "auxiliary_loss_mlp": 0.01078925, + "balance_loss_clip": 1.01643968, + "balance_loss_mlp": 0.99996072, + "epoch": 0.9663921120663741, + "flos": 68027703517440.0, + "grad_norm": 0.7272788928376945, + "language_loss": 0.65564072, + "learning_rate": 1.1796802636860003e-08, + "loss": 0.67737901, + "num_input_tokens_seen": 173530205, + "step": 8037, + "time_per_iteration": 3.3325023651123047 + }, + { + "auxiliary_loss_clip": 0.01135214, + "auxiliary_loss_mlp": 0.01083333, + "balance_loss_clip": 1.02581048, + "balance_loss_mlp": 1.00327206, + "epoch": 0.9665123549570132, + "flos": 26322916769280.0, + "grad_norm": 2.0474131586400963, + "language_loss": 0.73643988, + "learning_rate": 1.1712471781146316e-08, + "loss": 0.75862533, + "num_input_tokens_seen": 173549540, + "step": 8038, + "time_per_iteration": 2.5928986072540283 + }, + { + "auxiliary_loss_clip": 0.01134532, + "auxiliary_loss_mlp": 0.01083511, + "balance_loss_clip": 1.02532458, + "balance_loss_mlp": 1.0034976, + "epoch": 0.9666325978476522, + "flos": 43941121557120.0, + "grad_norm": 1.7181884051501781, + "language_loss": 0.66854286, + "learning_rate": 1.1628442547069628e-08, + "loss": 0.6907233, + "num_input_tokens_seen": 173571740, + "step": 8039, + "time_per_iteration": 2.8579773902893066 + }, + { + "auxiliary_loss_clip": 0.01125859, + "auxiliary_loss_mlp": 0.00872906, + "balance_loss_clip": 1.02503777, + "balance_loss_mlp": 1.00009763, + "epoch": 0.9667528407382914, + "flos": 21543422198400.0, + "grad_norm": 1.7942099994350964, + "language_loss": 0.77254206, + "learning_rate": 1.1544714947377521e-08, + "loss": 0.7925297, + "num_input_tokens_seen": 173589425, + "step": 8040, + "time_per_iteration": 2.6996543407440186 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01084119, + "balance_loss_clip": 1.02759254, + "balance_loss_mlp": 1.00381994, + "epoch": 0.9668730836289304, + "flos": 23878585278720.0, + "grad_norm": 1.8630508024797652, + "language_loss": 0.69673526, + "learning_rate": 1.1461288994770945e-08, + "loss": 0.71894407, + "num_input_tokens_seen": 173608500, + "step": 8041, + "time_per_iteration": 2.814807176589966 + }, + { + "auxiliary_loss_clip": 0.01134601, + "auxiliary_loss_mlp": 0.01083987, + "balance_loss_clip": 1.02520037, + "balance_loss_mlp": 1.00378311, + "epoch": 0.9669933265195695, + "flos": 28293011971200.0, + "grad_norm": 1.8790056514643136, + "language_loss": 0.77170867, + "learning_rate": 1.1378164701906002e-08, + "loss": 0.79389453, + "num_input_tokens_seen": 173630265, + "step": 8042, + "time_per_iteration": 2.7507131099700928 + }, + { + "auxiliary_loss_clip": 0.01135422, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_clip": 1.02605796, + "balance_loss_mlp": 1.0045079, + "epoch": 0.9671135694102087, + "flos": 22454763091200.0, + "grad_norm": 1.650384033060725, + "language_loss": 0.66460818, + "learning_rate": 1.1295342081392156e-08, + "loss": 0.68680906, + "num_input_tokens_seen": 173649625, + "step": 8043, + "time_per_iteration": 2.7076969146728516 + }, + { + "auxiliary_loss_clip": 0.01099859, + "auxiliary_loss_mlp": 0.01083683, + "balance_loss_clip": 1.02451909, + "balance_loss_mlp": 1.00362217, + "epoch": 0.9672338123008477, + "flos": 20155941596160.0, + "grad_norm": 1.8964998899660785, + "language_loss": 0.69282687, + "learning_rate": 1.1212821145793804e-08, + "loss": 0.71466219, + "num_input_tokens_seen": 173669240, + "step": 8044, + "time_per_iteration": 2.7623279094696045 + }, + { + "auxiliary_loss_clip": 0.01114806, + "auxiliary_loss_mlp": 0.01084353, + "balance_loss_clip": 1.02321136, + "balance_loss_mlp": 1.0043875, + "epoch": 0.9673540551914868, + "flos": 16977487939200.0, + "grad_norm": 3.087150256192362, + "language_loss": 0.78884703, + "learning_rate": 1.1130601907629156e-08, + "loss": 0.81083858, + "num_input_tokens_seen": 173686970, + "step": 8045, + "time_per_iteration": 2.6658897399902344 + }, + { + "auxiliary_loss_clip": 0.0110473, + "auxiliary_loss_mlp": 0.01079027, + "balance_loss_clip": 1.01740658, + "balance_loss_mlp": 1.00006294, + "epoch": 0.9674742980821259, + "flos": 61892903952000.0, + "grad_norm": 0.8073333449643283, + "language_loss": 0.64808977, + "learning_rate": 1.1048684379370899e-08, + "loss": 0.66992736, + "num_input_tokens_seen": 173747655, + "step": 8046, + "time_per_iteration": 3.2069168090820312 + }, + { + "auxiliary_loss_clip": 0.01116484, + "auxiliary_loss_mlp": 0.01083523, + "balance_loss_clip": 1.02523637, + "balance_loss_mlp": 1.00355744, + "epoch": 0.967594540972765, + "flos": 18697824898560.0, + "grad_norm": 1.908667268139591, + "language_loss": 0.74373329, + "learning_rate": 1.0967068573445759e-08, + "loss": 0.76573336, + "num_input_tokens_seen": 173765140, + "step": 8047, + "time_per_iteration": 3.5757031440734863 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01084398, + "balance_loss_clip": 1.02454627, + "balance_loss_mlp": 1.00428963, + "epoch": 0.967714783863404, + "flos": 20777411733120.0, + "grad_norm": 2.198148014874131, + "language_loss": 0.65067929, + "learning_rate": 1.0885754502234945e-08, + "loss": 0.67268503, + "num_input_tokens_seen": 173784800, + "step": 8048, + "time_per_iteration": 3.6078429222106934 + }, + { + "auxiliary_loss_clip": 0.01081258, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_clip": 1.02155876, + "balance_loss_mlp": 1.00307679, + "epoch": 0.9678350267540432, + "flos": 23185473465600.0, + "grad_norm": 1.84890488594028, + "language_loss": 0.77776456, + "learning_rate": 1.08047421780737e-08, + "loss": 0.79940891, + "num_input_tokens_seen": 173803990, + "step": 8049, + "time_per_iteration": 2.7761893272399902 + }, + { + "auxiliary_loss_clip": 0.01117986, + "auxiliary_loss_mlp": 0.00872933, + "balance_loss_clip": 1.02471817, + "balance_loss_mlp": 1.00010073, + "epoch": 0.9679552696446823, + "flos": 21726063878400.0, + "grad_norm": 2.4981915454378263, + "language_loss": 0.74052805, + "learning_rate": 1.0724031613251305e-08, + "loss": 0.76043725, + "num_input_tokens_seen": 173821890, + "step": 8050, + "time_per_iteration": 2.7240946292877197 + }, + { + "auxiliary_loss_clip": 0.01125819, + "auxiliary_loss_mlp": 0.01084615, + "balance_loss_clip": 1.02440429, + "balance_loss_mlp": 1.00436342, + "epoch": 0.9680755125353213, + "flos": 26869046129280.0, + "grad_norm": 1.9087596059732035, + "language_loss": 0.66465676, + "learning_rate": 1.0643622820011744e-08, + "loss": 0.68676108, + "num_input_tokens_seen": 173842945, + "step": 8051, + "time_per_iteration": 3.646714687347412 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.01085528, + "balance_loss_clip": 1.02631259, + "balance_loss_mlp": 1.00537205, + "epoch": 0.9681957554259605, + "flos": 28325008010880.0, + "grad_norm": 2.640832050133024, + "language_loss": 0.67782784, + "learning_rate": 1.0563515810552814e-08, + "loss": 0.70004416, + "num_input_tokens_seen": 173859915, + "step": 8052, + "time_per_iteration": 2.733527898788452 + }, + { + "auxiliary_loss_clip": 0.01136629, + "auxiliary_loss_mlp": 0.01085049, + "balance_loss_clip": 1.02711058, + "balance_loss_mlp": 1.00489283, + "epoch": 0.9683159983165995, + "flos": 20557674282240.0, + "grad_norm": 1.4871560008967388, + "language_loss": 0.73226106, + "learning_rate": 1.0483710597026795e-08, + "loss": 0.75447786, + "num_input_tokens_seen": 173879775, + "step": 8053, + "time_per_iteration": 3.599792003631592 + }, + { + "auxiliary_loss_clip": 0.01106473, + "auxiliary_loss_mlp": 0.01084292, + "balance_loss_clip": 1.02420974, + "balance_loss_mlp": 1.00423145, + "epoch": 0.9684362412072386, + "flos": 24207958016640.0, + "grad_norm": 2.045562768965655, + "language_loss": 0.73907769, + "learning_rate": 1.0404207191540227e-08, + "loss": 0.76098537, + "num_input_tokens_seen": 173900230, + "step": 8054, + "time_per_iteration": 2.824807643890381 + }, + { + "auxiliary_loss_clip": 0.01134183, + "auxiliary_loss_mlp": 0.01084631, + "balance_loss_clip": 1.02480567, + "balance_loss_mlp": 1.00447488, + "epoch": 0.9685564840978778, + "flos": 22346241125760.0, + "grad_norm": 3.102694014742797, + "language_loss": 0.74824011, + "learning_rate": 1.0325005606153236e-08, + "loss": 0.77042824, + "num_input_tokens_seen": 173919690, + "step": 8055, + "time_per_iteration": 2.6943836212158203 + }, + { + "auxiliary_loss_clip": 0.01083359, + "auxiliary_loss_mlp": 0.01083869, + "balance_loss_clip": 1.02453279, + "balance_loss_mlp": 1.00371265, + "epoch": 0.9686767269885168, + "flos": 14386389477120.0, + "grad_norm": 2.641377024760972, + "language_loss": 0.79413325, + "learning_rate": 1.0246105852881104e-08, + "loss": 0.81580555, + "num_input_tokens_seen": 173934790, + "step": 8056, + "time_per_iteration": 2.8162667751312256 + }, + { + "auxiliary_loss_clip": 0.0113514, + "auxiliary_loss_mlp": 0.01084875, + "balance_loss_clip": 1.02535725, + "balance_loss_mlp": 1.00467086, + "epoch": 0.9687969698791559, + "flos": 21287630471040.0, + "grad_norm": 1.7685574223746976, + "language_loss": 0.78339887, + "learning_rate": 1.0167507943692476e-08, + "loss": 0.80559903, + "num_input_tokens_seen": 173953875, + "step": 8057, + "time_per_iteration": 2.665515661239624 + }, + { + "auxiliary_loss_clip": 0.0111936, + "auxiliary_loss_mlp": 0.01084707, + "balance_loss_clip": 1.02504838, + "balance_loss_mlp": 1.0045507, + "epoch": 0.968917212769795, + "flos": 19828328624640.0, + "grad_norm": 2.816851120285781, + "language_loss": 0.71348107, + "learning_rate": 1.008921189051093e-08, + "loss": 0.73552173, + "num_input_tokens_seen": 173971220, + "step": 8058, + "time_per_iteration": 2.7081005573272705 + }, + { + "auxiliary_loss_clip": 0.01135523, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_clip": 1.02652597, + "balance_loss_mlp": 1.00403571, + "epoch": 0.9690374556604341, + "flos": 21681749473920.0, + "grad_norm": 1.977786871660418, + "language_loss": 0.77322221, + "learning_rate": 1.0011217705213848e-08, + "loss": 0.79541886, + "num_input_tokens_seen": 173989095, + "step": 8059, + "time_per_iteration": 2.74472713470459 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_clip": 1.02469325, + "balance_loss_mlp": 1.00386107, + "epoch": 0.9691576985510731, + "flos": 32635437851520.0, + "grad_norm": 1.6518041216892791, + "language_loss": 0.74461949, + "learning_rate": 9.933525399632658e-09, + "loss": 0.766707, + "num_input_tokens_seen": 174007330, + "step": 8060, + "time_per_iteration": 2.7577221393585205 + }, + { + "auxiliary_loss_clip": 0.0110903, + "auxiliary_loss_mlp": 0.01084472, + "balance_loss_clip": 1.01927328, + "balance_loss_mlp": 1.00426805, + "epoch": 0.9692779414417123, + "flos": 35663174040960.0, + "grad_norm": 1.6028969101027148, + "language_loss": 0.6485616, + "learning_rate": 9.856134985553488e-09, + "loss": 0.6704967, + "num_input_tokens_seen": 174027055, + "step": 8061, + "time_per_iteration": 2.824977397918701 + }, + { + "auxiliary_loss_clip": 0.01134793, + "auxiliary_loss_mlp": 0.01083689, + "balance_loss_clip": 1.02562082, + "balance_loss_mlp": 1.00358057, + "epoch": 0.9693981843323514, + "flos": 28366952117760.0, + "grad_norm": 1.5085867572679028, + "language_loss": 0.7329517, + "learning_rate": 9.77904647471628e-09, + "loss": 0.75513649, + "num_input_tokens_seen": 174050235, + "step": 8062, + "time_per_iteration": 2.6594574451446533 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.01083284, + "balance_loss_clip": 1.02304685, + "balance_loss_mlp": 1.0032227, + "epoch": 0.9695184272229904, + "flos": 23622865378560.0, + "grad_norm": 2.4093188169824296, + "language_loss": 0.73765033, + "learning_rate": 9.702259878815454e-09, + "loss": 0.75944626, + "num_input_tokens_seen": 174070560, + "step": 8063, + "time_per_iteration": 2.8431813716888428 + }, + { + "auxiliary_loss_clip": 0.01125698, + "auxiliary_loss_mlp": 0.01086356, + "balance_loss_clip": 1.02564025, + "balance_loss_mlp": 1.0060569, + "epoch": 0.9696386701136296, + "flos": 23294677789440.0, + "grad_norm": 2.140062093232698, + "language_loss": 0.74437368, + "learning_rate": 9.625775209499254e-09, + "loss": 0.76649427, + "num_input_tokens_seen": 174090565, + "step": 8064, + "time_per_iteration": 2.6611030101776123 + }, + { + "auxiliary_loss_clip": 0.0110768, + "auxiliary_loss_mlp": 0.0108377, + "balance_loss_clip": 1.02407026, + "balance_loss_mlp": 1.00366163, + "epoch": 0.9697589130042686, + "flos": 15121876360320.0, + "grad_norm": 2.429835022842789, + "language_loss": 0.74427849, + "learning_rate": 9.549592478370172e-09, + "loss": 0.76619297, + "num_input_tokens_seen": 174108745, + "step": 8065, + "time_per_iteration": 2.7650229930877686 + }, + { + "auxiliary_loss_clip": 0.01125214, + "auxiliary_loss_mlp": 0.01083243, + "balance_loss_clip": 1.0239737, + "balance_loss_mlp": 1.0032295, + "epoch": 0.9698791558949077, + "flos": 18879532824960.0, + "grad_norm": 1.8561181793104693, + "language_loss": 0.79211748, + "learning_rate": 9.473711696985632e-09, + "loss": 0.81420207, + "num_input_tokens_seen": 174128075, + "step": 8066, + "time_per_iteration": 2.765660285949707 + }, + { + "auxiliary_loss_clip": 0.01092423, + "auxiliary_loss_mlp": 0.01083827, + "balance_loss_clip": 1.02366769, + "balance_loss_mlp": 1.00367045, + "epoch": 0.9699993987855468, + "flos": 17931455297280.0, + "grad_norm": 2.004473321993591, + "language_loss": 0.75921881, + "learning_rate": 9.398132876856201e-09, + "loss": 0.7809813, + "num_input_tokens_seen": 174147040, + "step": 8067, + "time_per_iteration": 2.733154296875 + }, + { + "auxiliary_loss_clip": 0.01079481, + "auxiliary_loss_mlp": 0.0107959, + "balance_loss_clip": 1.01747584, + "balance_loss_mlp": 1.00062609, + "epoch": 0.9701196416761859, + "flos": 67182186297600.0, + "grad_norm": 0.7728009838183483, + "language_loss": 0.6086182, + "learning_rate": 9.322856029447379e-09, + "loss": 0.63020891, + "num_input_tokens_seen": 174208225, + "step": 8068, + "time_per_iteration": 3.2768502235412598 + }, + { + "auxiliary_loss_clip": 0.01133137, + "auxiliary_loss_mlp": 0.0108502, + "balance_loss_clip": 1.02446985, + "balance_loss_mlp": 1.00495887, + "epoch": 0.970239884566825, + "flos": 24277804012800.0, + "grad_norm": 1.8032074575458925, + "language_loss": 0.80374664, + "learning_rate": 9.247881166178695e-09, + "loss": 0.82592821, + "num_input_tokens_seen": 174226935, + "step": 8069, + "time_per_iteration": 2.677229642868042 + }, + { + "auxiliary_loss_clip": 0.01108555, + "auxiliary_loss_mlp": 0.01084875, + "balance_loss_clip": 1.02423573, + "balance_loss_mlp": 1.00486183, + "epoch": 0.970360127457464, + "flos": 25301689194240.0, + "grad_norm": 2.4012683321241726, + "language_loss": 0.76751363, + "learning_rate": 9.173208298423274e-09, + "loss": 0.7894479, + "num_input_tokens_seen": 174248140, + "step": 8070, + "time_per_iteration": 2.8173117637634277 + }, + { + "auxiliary_loss_clip": 0.01076889, + "auxiliary_loss_mlp": 0.00872885, + "balance_loss_clip": 1.0210762, + "balance_loss_mlp": 1.00011349, + "epoch": 0.9704803703481032, + "flos": 29572473398400.0, + "grad_norm": 1.5312346401022985, + "language_loss": 0.76303327, + "learning_rate": 9.09883743750961e-09, + "loss": 0.78253102, + "num_input_tokens_seen": 174271030, + "step": 8071, + "time_per_iteration": 2.8331246376037598 + }, + { + "auxiliary_loss_clip": 0.01114715, + "auxiliary_loss_mlp": 0.01083613, + "balance_loss_clip": 1.02386177, + "balance_loss_mlp": 1.00355244, + "epoch": 0.9706006132387422, + "flos": 17380046638080.0, + "grad_norm": 1.7435389933415266, + "language_loss": 0.83881915, + "learning_rate": 9.024768594719124e-09, + "loss": 0.86080241, + "num_input_tokens_seen": 174289410, + "step": 8072, + "time_per_iteration": 2.7714879512786865 + }, + { + "auxiliary_loss_clip": 0.01101326, + "auxiliary_loss_mlp": 0.01084298, + "balance_loss_clip": 1.0197978, + "balance_loss_mlp": 1.00414133, + "epoch": 0.9707208561293813, + "flos": 18186421011840.0, + "grad_norm": 2.009659758879607, + "language_loss": 0.72726172, + "learning_rate": 8.95100178128816e-09, + "loss": 0.74911797, + "num_input_tokens_seen": 174308550, + "step": 8073, + "time_per_iteration": 4.526783227920532 + }, + { + "auxiliary_loss_clip": 0.01118469, + "auxiliary_loss_mlp": 0.01084086, + "balance_loss_clip": 1.02545667, + "balance_loss_mlp": 1.00392962, + "epoch": 0.9708410990200205, + "flos": 31248388212480.0, + "grad_norm": 1.8552854131493255, + "language_loss": 0.7013818, + "learning_rate": 8.877537008407321e-09, + "loss": 0.72340739, + "num_input_tokens_seen": 174328600, + "step": 8074, + "time_per_iteration": 2.7700300216674805 + }, + { + "auxiliary_loss_clip": 0.01116575, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_clip": 1.02420795, + "balance_loss_mlp": 1.00321937, + "epoch": 0.9709613419106595, + "flos": 30554450386560.0, + "grad_norm": 1.6047485685942524, + "language_loss": 0.6865797, + "learning_rate": 8.804374287221028e-09, + "loss": 0.70857728, + "num_input_tokens_seen": 174349835, + "step": 8075, + "time_per_iteration": 2.8249294757843018 + }, + { + "auxiliary_loss_clip": 0.01108898, + "auxiliary_loss_mlp": 0.0108458, + "balance_loss_clip": 1.02397394, + "balance_loss_mlp": 1.00447142, + "epoch": 0.9710815848012986, + "flos": 23730166281600.0, + "grad_norm": 1.7981680991802644, + "language_loss": 0.84582275, + "learning_rate": 8.731513628827958e-09, + "loss": 0.8677575, + "num_input_tokens_seen": 174369200, + "step": 8076, + "time_per_iteration": 2.8610599040985107 + }, + { + "auxiliary_loss_clip": 0.01124582, + "auxiliary_loss_mlp": 0.0108441, + "balance_loss_clip": 1.02425349, + "balance_loss_mlp": 1.00430107, + "epoch": 0.9712018276919377, + "flos": 23761875012480.0, + "grad_norm": 1.8105919084249265, + "language_loss": 0.82680702, + "learning_rate": 8.658955044280825e-09, + "loss": 0.84889692, + "num_input_tokens_seen": 174388125, + "step": 8077, + "time_per_iteration": 3.6059911251068115 + }, + { + "auxiliary_loss_clip": 0.01120205, + "auxiliary_loss_mlp": 0.01083938, + "balance_loss_clip": 1.02139378, + "balance_loss_mlp": 1.00382996, + "epoch": 0.9713220705825768, + "flos": 23330983461120.0, + "grad_norm": 1.532897427378239, + "language_loss": 0.77561378, + "learning_rate": 8.586698544587268e-09, + "loss": 0.79765522, + "num_input_tokens_seen": 174409735, + "step": 8078, + "time_per_iteration": 3.624610662460327 + }, + { + "auxiliary_loss_clip": 0.01119392, + "auxiliary_loss_mlp": 0.01084254, + "balance_loss_clip": 1.02565992, + "balance_loss_mlp": 1.00409746, + "epoch": 0.9714423134732159, + "flos": 22200946611840.0, + "grad_norm": 1.9387356635183153, + "language_loss": 0.73962939, + "learning_rate": 8.514744140707853e-09, + "loss": 0.76166582, + "num_input_tokens_seen": 174428875, + "step": 8079, + "time_per_iteration": 2.731671094894409 + }, + { + "auxiliary_loss_clip": 0.01134888, + "auxiliary_loss_mlp": 0.01083772, + "balance_loss_clip": 1.02582288, + "balance_loss_mlp": 1.00375843, + "epoch": 0.971562556363855, + "flos": 20229917656320.0, + "grad_norm": 1.5062370897403305, + "language_loss": 0.76558489, + "learning_rate": 8.443091843558515e-09, + "loss": 0.78777146, + "num_input_tokens_seen": 174447960, + "step": 8080, + "time_per_iteration": 2.61366868019104 + }, + { + "auxiliary_loss_clip": 0.01115423, + "auxiliary_loss_mlp": 0.01083647, + "balance_loss_clip": 1.02402377, + "balance_loss_mlp": 1.003443, + "epoch": 0.9716827992544941, + "flos": 24970197553920.0, + "grad_norm": 1.9880608647160591, + "language_loss": 0.64637721, + "learning_rate": 8.37174166400878e-09, + "loss": 0.66836798, + "num_input_tokens_seen": 174463535, + "step": 8081, + "time_per_iteration": 2.7838234901428223 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01083335, + "balance_loss_clip": 1.02613926, + "balance_loss_mlp": 1.00332212, + "epoch": 0.9718030421451331, + "flos": 24681476033280.0, + "grad_norm": 1.7954012022016055, + "language_loss": 0.85018408, + "learning_rate": 8.300693612881992e-09, + "loss": 0.87237024, + "num_input_tokens_seen": 174483600, + "step": 8082, + "time_per_iteration": 2.614701509475708 + }, + { + "auxiliary_loss_clip": 0.01124453, + "auxiliary_loss_mlp": 0.00872954, + "balance_loss_clip": 1.02482271, + "balance_loss_mlp": 1.00010312, + "epoch": 0.9719232850357723, + "flos": 22090700793600.0, + "grad_norm": 1.8235572516209682, + "language_loss": 0.81367356, + "learning_rate": 8.22994770095664e-09, + "loss": 0.83364761, + "num_input_tokens_seen": 174502175, + "step": 8083, + "time_per_iteration": 2.735975503921509 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_clip": 1.02247548, + "balance_loss_mlp": 1.00575483, + "epoch": 0.9720435279264114, + "flos": 23656908493440.0, + "grad_norm": 1.9248432019269908, + "language_loss": 0.75391078, + "learning_rate": 8.159503938964585e-09, + "loss": 0.77587962, + "num_input_tokens_seen": 174519495, + "step": 8084, + "time_per_iteration": 2.732797861099243 + }, + { + "auxiliary_loss_clip": 0.01106624, + "auxiliary_loss_mlp": 0.01084194, + "balance_loss_clip": 1.02434325, + "balance_loss_mlp": 1.00427628, + "epoch": 0.9721637708170504, + "flos": 28365910623360.0, + "grad_norm": 1.9141208364716085, + "language_loss": 0.70512193, + "learning_rate": 8.089362337592164e-09, + "loss": 0.72703016, + "num_input_tokens_seen": 174543120, + "step": 8085, + "time_per_iteration": 2.8435914516448975 + }, + { + "auxiliary_loss_clip": 0.01113656, + "auxiliary_loss_mlp": 0.01083627, + "balance_loss_clip": 1.02358532, + "balance_loss_mlp": 1.00347102, + "epoch": 0.9722840137076896, + "flos": 29130807767040.0, + "grad_norm": 3.089526918832514, + "language_loss": 0.71974528, + "learning_rate": 8.019522907479536e-09, + "loss": 0.74171805, + "num_input_tokens_seen": 174563480, + "step": 8086, + "time_per_iteration": 2.786231756210327 + }, + { + "auxiliary_loss_clip": 0.0112839, + "auxiliary_loss_mlp": 0.01083908, + "balance_loss_clip": 1.02711117, + "balance_loss_mlp": 1.00384688, + "epoch": 0.9724042565983286, + "flos": 19243954258560.0, + "grad_norm": 2.0397472955997014, + "language_loss": 0.77424687, + "learning_rate": 7.949985659221558e-09, + "loss": 0.79636991, + "num_input_tokens_seen": 174580745, + "step": 8087, + "time_per_iteration": 2.6982150077819824 + }, + { + "auxiliary_loss_clip": 0.01115345, + "auxiliary_loss_mlp": 0.01083106, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.00294924, + "epoch": 0.9725244994889677, + "flos": 23039676161280.0, + "grad_norm": 1.9068866615155284, + "language_loss": 0.79327512, + "learning_rate": 7.880750603366904e-09, + "loss": 0.81525958, + "num_input_tokens_seen": 174599615, + "step": 8088, + "time_per_iteration": 2.872669219970703 + }, + { + "auxiliary_loss_clip": 0.01107478, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_clip": 1.0230608, + "balance_loss_mlp": 1.0042727, + "epoch": 0.9726447423796069, + "flos": 23367468700800.0, + "grad_norm": 1.7107192400330247, + "language_loss": 0.7970258, + "learning_rate": 7.811817750418282e-09, + "loss": 0.81894541, + "num_input_tokens_seen": 174618375, + "step": 8089, + "time_per_iteration": 2.8231441974639893 + }, + { + "auxiliary_loss_clip": 0.01098289, + "auxiliary_loss_mlp": 0.01084315, + "balance_loss_clip": 1.02308202, + "balance_loss_mlp": 1.00406361, + "epoch": 0.9727649852702459, + "flos": 26541648639360.0, + "grad_norm": 1.976395274362018, + "language_loss": 0.80052054, + "learning_rate": 7.743187110833105e-09, + "loss": 0.82234663, + "num_input_tokens_seen": 174641135, + "step": 8090, + "time_per_iteration": 2.8159525394439697 + }, + { + "auxiliary_loss_clip": 0.01115511, + "auxiliary_loss_mlp": 0.01083087, + "balance_loss_clip": 1.02286208, + "balance_loss_mlp": 1.00302625, + "epoch": 0.972885228160885, + "flos": 20522338277760.0, + "grad_norm": 1.4486787617547539, + "language_loss": 0.80772454, + "learning_rate": 7.674858695022602e-09, + "loss": 0.82971054, + "num_input_tokens_seen": 174659490, + "step": 8091, + "time_per_iteration": 2.6633756160736084 + }, + { + "auxiliary_loss_clip": 0.01135457, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_clip": 1.0259254, + "balance_loss_mlp": 1.00389266, + "epoch": 0.9730054710515241, + "flos": 17566064196480.0, + "grad_norm": 2.8471267377246035, + "language_loss": 0.75801474, + "learning_rate": 7.606832513351591e-09, + "loss": 0.7802099, + "num_input_tokens_seen": 174677440, + "step": 8092, + "time_per_iteration": 2.6949737071990967 + }, + { + "auxiliary_loss_clip": 0.01112765, + "auxiliary_loss_mlp": 0.00872928, + "balance_loss_clip": 1.01760924, + "balance_loss_mlp": 1.0013032, + "epoch": 0.9731257139421632, + "flos": 68972010117120.0, + "grad_norm": 0.8234193924363103, + "language_loss": 0.63928008, + "learning_rate": 7.539108576140264e-09, + "loss": 0.65913701, + "num_input_tokens_seen": 174741550, + "step": 8093, + "time_per_iteration": 3.254889488220215 + }, + { + "auxiliary_loss_clip": 0.01094823, + "auxiliary_loss_mlp": 0.01084596, + "balance_loss_clip": 1.0223155, + "balance_loss_mlp": 1.00458312, + "epoch": 0.9732459568328022, + "flos": 18478841633280.0, + "grad_norm": 2.0825724443396623, + "language_loss": 0.70090926, + "learning_rate": 7.471686893661732e-09, + "loss": 0.72270346, + "num_input_tokens_seen": 174759845, + "step": 8094, + "time_per_iteration": 2.816053867340088 + }, + { + "auxiliary_loss_clip": 0.01114388, + "auxiliary_loss_mlp": 0.0108387, + "balance_loss_clip": 1.02444577, + "balance_loss_mlp": 1.00380921, + "epoch": 0.9733661997234414, + "flos": 20883886623360.0, + "grad_norm": 2.000584418546213, + "language_loss": 0.63985753, + "learning_rate": 7.4045674761442636e-09, + "loss": 0.66184008, + "num_input_tokens_seen": 174777175, + "step": 8095, + "time_per_iteration": 2.6776461601257324 + }, + { + "auxiliary_loss_clip": 0.01134785, + "auxiliary_loss_mlp": 0.00872885, + "balance_loss_clip": 1.02567112, + "balance_loss_mlp": 1.00007463, + "epoch": 0.9734864426140805, + "flos": 23766795175680.0, + "grad_norm": 1.690853336426448, + "language_loss": 0.74074066, + "learning_rate": 7.337750333769488e-09, + "loss": 0.76081729, + "num_input_tokens_seen": 174796980, + "step": 8096, + "time_per_iteration": 2.6937437057495117 + }, + { + "auxiliary_loss_clip": 0.01119159, + "auxiliary_loss_mlp": 0.01083888, + "balance_loss_clip": 1.02532053, + "balance_loss_mlp": 1.00373197, + "epoch": 0.9736066855047195, + "flos": 35042422176000.0, + "grad_norm": 1.7447417135250045, + "language_loss": 0.72670048, + "learning_rate": 7.2712354766737425e-09, + "loss": 0.74873102, + "num_input_tokens_seen": 174817310, + "step": 8097, + "time_per_iteration": 2.8684065341949463 + }, + { + "auxiliary_loss_clip": 0.01102025, + "auxiliary_loss_mlp": 0.01083345, + "balance_loss_clip": 1.02129459, + "balance_loss_mlp": 1.00318861, + "epoch": 0.9737269283953586, + "flos": 20410620001920.0, + "grad_norm": 1.510491986523042, + "language_loss": 0.80845737, + "learning_rate": 7.2050229149469565e-09, + "loss": 0.83031106, + "num_input_tokens_seen": 174837320, + "step": 8098, + "time_per_iteration": 3.7470643520355225 + }, + { + "auxiliary_loss_clip": 0.01109013, + "auxiliary_loss_mlp": 0.01083664, + "balance_loss_clip": 1.02391648, + "balance_loss_mlp": 1.00355577, + "epoch": 0.9738471712859977, + "flos": 28911680847360.0, + "grad_norm": 1.7683775985476307, + "language_loss": 0.63839155, + "learning_rate": 7.139112658633984e-09, + "loss": 0.66031837, + "num_input_tokens_seen": 174857470, + "step": 8099, + "time_per_iteration": 3.7478837966918945 + }, + { + "auxiliary_loss_clip": 0.0110353, + "auxiliary_loss_mlp": 0.010842, + "balance_loss_clip": 1.02186251, + "balance_loss_mlp": 1.00409102, + "epoch": 0.9739674141766368, + "flos": 27782326356480.0, + "grad_norm": 2.820252860340063, + "language_loss": 0.6988287, + "learning_rate": 7.073504717733048e-09, + "loss": 0.72070599, + "num_input_tokens_seen": 174877035, + "step": 8100, + "time_per_iteration": 2.8313400745391846 + }, + { + "auxiliary_loss_clip": 0.01058597, + "auxiliary_loss_mlp": 0.01079395, + "balance_loss_clip": 1.02168345, + "balance_loss_mlp": 1.00043058, + "epoch": 0.9740876570672758, + "flos": 68863057188480.0, + "grad_norm": 0.7395913470202423, + "language_loss": 0.57234794, + "learning_rate": 7.008199102196855e-09, + "loss": 0.59372783, + "num_input_tokens_seen": 174938460, + "step": 8101, + "time_per_iteration": 3.3258609771728516 + }, + { + "auxiliary_loss_clip": 0.01092698, + "auxiliary_loss_mlp": 0.01079186, + "balance_loss_clip": 1.01451278, + "balance_loss_mlp": 1.00022221, + "epoch": 0.974207899957915, + "flos": 58236622646400.0, + "grad_norm": 0.796773344933366, + "language_loss": 0.59019494, + "learning_rate": 6.9431958219321464e-09, + "loss": 0.61191368, + "num_input_tokens_seen": 174994625, + "step": 8102, + "time_per_iteration": 4.215846300125122 + }, + { + "auxiliary_loss_clip": 0.01116101, + "auxiliary_loss_mlp": 0.01084308, + "balance_loss_clip": 1.02370977, + "balance_loss_mlp": 1.00410366, + "epoch": 0.9743281428485541, + "flos": 22600057605120.0, + "grad_norm": 2.4640715900611125, + "language_loss": 0.7777065, + "learning_rate": 6.878494886800146e-09, + "loss": 0.79971057, + "num_input_tokens_seen": 175015400, + "step": 8103, + "time_per_iteration": 3.6597278118133545 + }, + { + "auxiliary_loss_clip": 0.01094015, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_clip": 1.02483189, + "balance_loss_mlp": 1.0038507, + "epoch": 0.9744483857391931, + "flos": 20008815488640.0, + "grad_norm": 2.595675843065722, + "language_loss": 0.76236755, + "learning_rate": 6.814096306615669e-09, + "loss": 0.78414679, + "num_input_tokens_seen": 175033540, + "step": 8104, + "time_per_iteration": 2.675126791000366 + }, + { + "auxiliary_loss_clip": 0.01115314, + "auxiliary_loss_mlp": 0.01083997, + "balance_loss_clip": 1.02245605, + "balance_loss_mlp": 1.00384092, + "epoch": 0.9745686286298323, + "flos": 17675268520320.0, + "grad_norm": 2.1343276330638554, + "language_loss": 0.65334451, + "learning_rate": 6.750000091148011e-09, + "loss": 0.67533755, + "num_input_tokens_seen": 175050835, + "step": 8105, + "time_per_iteration": 2.6988587379455566 + }, + { + "auxiliary_loss_clip": 0.01135285, + "auxiliary_loss_mlp": 0.01083894, + "balance_loss_clip": 1.02635288, + "balance_loss_mlp": 1.00378525, + "epoch": 0.9746888715204713, + "flos": 29460252332160.0, + "grad_norm": 2.5772135467457664, + "language_loss": 0.72500944, + "learning_rate": 6.686206250120729e-09, + "loss": 0.7472012, + "num_input_tokens_seen": 175072330, + "step": 8106, + "time_per_iteration": 2.699187755584717 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_clip": 1.02352452, + "balance_loss_mlp": 1.00296617, + "epoch": 0.9748091144111104, + "flos": 18479308510080.0, + "grad_norm": 1.6532051080372114, + "language_loss": 0.74592125, + "learning_rate": 6.622714793210749e-09, + "loss": 0.76783395, + "num_input_tokens_seen": 175091250, + "step": 8107, + "time_per_iteration": 2.8124945163726807 + }, + { + "auxiliary_loss_clip": 0.01135646, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_clip": 1.02629447, + "balance_loss_mlp": 1.00340331, + "epoch": 0.9749293573017496, + "flos": 20665154753280.0, + "grad_norm": 1.6321417451709475, + "language_loss": 0.78561151, + "learning_rate": 6.559525730050364e-09, + "loss": 0.80780309, + "num_input_tokens_seen": 175111350, + "step": 8108, + "time_per_iteration": 2.698840618133545 + }, + { + "auxiliary_loss_clip": 0.01106334, + "auxiliary_loss_mlp": 0.0108502, + "balance_loss_clip": 1.02351832, + "balance_loss_mlp": 1.00500643, + "epoch": 0.9750496001923886, + "flos": 18478590238080.0, + "grad_norm": 2.0338328807159725, + "language_loss": 0.75977075, + "learning_rate": 6.496639070224574e-09, + "loss": 0.78168434, + "num_input_tokens_seen": 175129835, + "step": 8109, + "time_per_iteration": 2.8368823528289795 + }, + { + "auxiliary_loss_clip": 0.0112754, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_clip": 1.0263207, + "balance_loss_mlp": 1.00460994, + "epoch": 0.9751698430830277, + "flos": 19572967860480.0, + "grad_norm": 2.147662355570754, + "language_loss": 0.84027594, + "learning_rate": 6.4340548232739714e-09, + "loss": 0.86239851, + "num_input_tokens_seen": 175146035, + "step": 8110, + "time_per_iteration": 2.7104222774505615 + }, + { + "auxiliary_loss_clip": 0.01110693, + "auxiliary_loss_mlp": 0.01084274, + "balance_loss_clip": 1.02574158, + "balance_loss_mlp": 1.00421262, + "epoch": 0.9752900859736668, + "flos": 23550325862400.0, + "grad_norm": 1.587350329431076, + "language_loss": 0.7893852, + "learning_rate": 6.371772998692071e-09, + "loss": 0.81133491, + "num_input_tokens_seen": 175165290, + "step": 8111, + "time_per_iteration": 2.789156198501587 + }, + { + "auxiliary_loss_clip": 0.0110652, + "auxiliary_loss_mlp": 0.0108402, + "balance_loss_clip": 1.0229156, + "balance_loss_mlp": 1.00391161, + "epoch": 0.9754103288643059, + "flos": 20303211358080.0, + "grad_norm": 2.59266182082363, + "language_loss": 0.64683783, + "learning_rate": 6.309793605927094e-09, + "loss": 0.66874325, + "num_input_tokens_seen": 175183610, + "step": 8112, + "time_per_iteration": 2.7780046463012695 + }, + { + "auxiliary_loss_clip": 0.01102049, + "auxiliary_loss_mlp": 0.01083905, + "balance_loss_clip": 1.02536082, + "balance_loss_mlp": 1.00384355, + "epoch": 0.975530571754945, + "flos": 19350680544000.0, + "grad_norm": 1.696238454139385, + "language_loss": 0.79867506, + "learning_rate": 6.248116654381297e-09, + "loss": 0.82053459, + "num_input_tokens_seen": 175202080, + "step": 8113, + "time_per_iteration": 2.6716134548187256 + }, + { + "auxiliary_loss_clip": 0.01116864, + "auxiliary_loss_mlp": 0.01083993, + "balance_loss_clip": 1.02399611, + "balance_loss_mlp": 1.00397944, + "epoch": 0.9756508146455841, + "flos": 23583399310080.0, + "grad_norm": 4.335422369652571, + "language_loss": 0.72744453, + "learning_rate": 6.186742153410751e-09, + "loss": 0.74945307, + "num_input_tokens_seen": 175221575, + "step": 8114, + "time_per_iteration": 2.7468419075012207 + }, + { + "auxiliary_loss_clip": 0.01118044, + "auxiliary_loss_mlp": 0.01084168, + "balance_loss_clip": 1.02558064, + "balance_loss_mlp": 1.00396371, + "epoch": 0.9757710575362232, + "flos": 22966921163520.0, + "grad_norm": 1.8539552421136891, + "language_loss": 0.87851387, + "learning_rate": 6.125670112326453e-09, + "loss": 0.90053594, + "num_input_tokens_seen": 175240835, + "step": 8115, + "time_per_iteration": 2.6717636585235596 + }, + { + "auxiliary_loss_clip": 0.01126693, + "auxiliary_loss_mlp": 0.01082709, + "balance_loss_clip": 1.02587152, + "balance_loss_mlp": 1.00260007, + "epoch": 0.9758913004268622, + "flos": 27966009530880.0, + "grad_norm": 1.6748463616605964, + "language_loss": 0.69880551, + "learning_rate": 6.064900540392548e-09, + "loss": 0.72089952, + "num_input_tokens_seen": 175262930, + "step": 8116, + "time_per_iteration": 2.7456343173980713 + }, + { + "auxiliary_loss_clip": 0.01113804, + "auxiliary_loss_mlp": 0.01082517, + "balance_loss_clip": 1.02372003, + "balance_loss_mlp": 1.00259876, + "epoch": 0.9760115433175014, + "flos": 22200156512640.0, + "grad_norm": 2.0831273636744045, + "language_loss": 0.7847113, + "learning_rate": 6.0044334468278835e-09, + "loss": 0.80667448, + "num_input_tokens_seen": 175282275, + "step": 8117, + "time_per_iteration": 2.7467451095581055 + }, + { + "auxiliary_loss_clip": 0.01095837, + "auxiliary_loss_mlp": 0.01084875, + "balance_loss_clip": 1.02221656, + "balance_loss_mlp": 1.00481391, + "epoch": 0.9761317862081405, + "flos": 26250736389120.0, + "grad_norm": 1.6498714066543247, + "language_loss": 0.71635807, + "learning_rate": 5.944268840805345e-09, + "loss": 0.7381652, + "num_input_tokens_seen": 175303020, + "step": 8118, + "time_per_iteration": 2.841609477996826 + }, + { + "auxiliary_loss_clip": 0.01101248, + "auxiliary_loss_mlp": 0.01085501, + "balance_loss_clip": 1.02396882, + "balance_loss_mlp": 1.00548816, + "epoch": 0.9762520290987795, + "flos": 26575440359040.0, + "grad_norm": 2.009174392194838, + "language_loss": 0.64118212, + "learning_rate": 5.88440673145163e-09, + "loss": 0.66304964, + "num_input_tokens_seen": 175324070, + "step": 8119, + "time_per_iteration": 2.8216958045959473 + }, + { + "auxiliary_loss_clip": 0.01125871, + "auxiliary_loss_mlp": 0.01085099, + "balance_loss_clip": 1.02679157, + "balance_loss_mlp": 1.00494254, + "epoch": 0.9763722719894187, + "flos": 18005036307840.0, + "grad_norm": 2.070136728960962, + "language_loss": 0.82626736, + "learning_rate": 5.824847127848142e-09, + "loss": 0.84837699, + "num_input_tokens_seen": 175342595, + "step": 8120, + "time_per_iteration": 2.6476473808288574 + }, + { + "auxiliary_loss_clip": 0.01096509, + "auxiliary_loss_mlp": 0.01084732, + "balance_loss_clip": 1.02252436, + "balance_loss_mlp": 1.00467062, + "epoch": 0.9764925148800577, + "flos": 22455660931200.0, + "grad_norm": 1.8397969947311548, + "language_loss": 0.78855288, + "learning_rate": 5.765590039029433e-09, + "loss": 0.81036532, + "num_input_tokens_seen": 175361915, + "step": 8121, + "time_per_iteration": 2.83454966545105 + }, + { + "auxiliary_loss_clip": 0.01135423, + "auxiliary_loss_mlp": 0.01084448, + "balance_loss_clip": 1.02661026, + "balance_loss_mlp": 1.00433898, + "epoch": 0.9766127577706968, + "flos": 36757084786560.0, + "grad_norm": 1.4571164689263205, + "language_loss": 0.71196806, + "learning_rate": 5.706635473985422e-09, + "loss": 0.73416674, + "num_input_tokens_seen": 175385785, + "step": 8122, + "time_per_iteration": 2.7122509479522705 + }, + { + "auxiliary_loss_clip": 0.01124371, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.02433896, + "balance_loss_mlp": 1.00353527, + "epoch": 0.976733000661336, + "flos": 22309971367680.0, + "grad_norm": 2.0920281451747487, + "language_loss": 0.85168254, + "learning_rate": 5.6479834416591764e-09, + "loss": 0.8737613, + "num_input_tokens_seen": 175405145, + "step": 8123, + "time_per_iteration": 3.620696783065796 + }, + { + "auxiliary_loss_clip": 0.01125293, + "auxiliary_loss_mlp": 0.00873055, + "balance_loss_clip": 1.02487719, + "balance_loss_mlp": 1.00005591, + "epoch": 0.976853243551975, + "flos": 25810938264960.0, + "grad_norm": 1.7012862027390472, + "language_loss": 0.68342412, + "learning_rate": 5.589633950947803e-09, + "loss": 0.70340759, + "num_input_tokens_seen": 175422645, + "step": 8124, + "time_per_iteration": 2.703768253326416 + }, + { + "auxiliary_loss_clip": 0.01116363, + "auxiliary_loss_mlp": 0.0108439, + "balance_loss_clip": 1.02458668, + "balance_loss_mlp": 1.00413799, + "epoch": 0.9769734864426141, + "flos": 21397445326080.0, + "grad_norm": 1.9672404508028372, + "language_loss": 0.69870293, + "learning_rate": 5.5315870107035535e-09, + "loss": 0.72071046, + "num_input_tokens_seen": 175440695, + "step": 8125, + "time_per_iteration": 3.55059552192688 + }, + { + "auxiliary_loss_clip": 0.01110199, + "auxiliary_loss_mlp": 0.01084093, + "balance_loss_clip": 1.02058661, + "balance_loss_mlp": 1.0040319, + "epoch": 0.9770937293332532, + "flos": 13990977584640.0, + "grad_norm": 1.8914373577206869, + "language_loss": 0.78710413, + "learning_rate": 5.473842629731607e-09, + "loss": 0.80904704, + "num_input_tokens_seen": 175459195, + "step": 8126, + "time_per_iteration": 2.7010676860809326 + }, + { + "auxiliary_loss_clip": 0.01118235, + "auxiliary_loss_mlp": 0.00872883, + "balance_loss_clip": 1.0249145, + "balance_loss_mlp": 1.00005984, + "epoch": 0.9772139722238923, + "flos": 17931994001280.0, + "grad_norm": 1.868462567389022, + "language_loss": 0.78212893, + "learning_rate": 5.416400816792066e-09, + "loss": 0.8020401, + "num_input_tokens_seen": 175476710, + "step": 8127, + "time_per_iteration": 2.646888494491577 + }, + { + "auxiliary_loss_clip": 0.01133605, + "auxiliary_loss_mlp": 0.01083235, + "balance_loss_clip": 1.02483976, + "balance_loss_mlp": 1.00312591, + "epoch": 0.9773342151145313, + "flos": 20446171488000.0, + "grad_norm": 3.659414531905259, + "language_loss": 0.78194821, + "learning_rate": 5.359261580598407e-09, + "loss": 0.80411667, + "num_input_tokens_seen": 175492550, + "step": 8128, + "time_per_iteration": 4.449179172515869 + }, + { + "auxiliary_loss_clip": 0.01125117, + "auxiliary_loss_mlp": 0.01083748, + "balance_loss_clip": 1.02520514, + "balance_loss_mlp": 1.00354409, + "epoch": 0.9774544580051704, + "flos": 11837306949120.0, + "grad_norm": 2.60705478155349, + "language_loss": 0.77942371, + "learning_rate": 5.302424929819027e-09, + "loss": 0.80151236, + "num_input_tokens_seen": 175506560, + "step": 8129, + "time_per_iteration": 2.675255298614502 + }, + { + "auxiliary_loss_clip": 0.01125799, + "auxiliary_loss_mlp": 0.01083795, + "balance_loss_clip": 1.02392387, + "balance_loss_mlp": 1.00359058, + "epoch": 0.9775747008958096, + "flos": 13479932833920.0, + "grad_norm": 2.0774031070581755, + "language_loss": 0.73136818, + "learning_rate": 5.24589087307592e-09, + "loss": 0.7534641, + "num_input_tokens_seen": 175524180, + "step": 8130, + "time_per_iteration": 2.629565477371216 + }, + { + "auxiliary_loss_clip": 0.01134942, + "auxiliary_loss_mlp": 0.01084021, + "balance_loss_clip": 1.02536643, + "balance_loss_mlp": 1.00386524, + "epoch": 0.9776949437864486, + "flos": 59532314042880.0, + "grad_norm": 1.4165774082109666, + "language_loss": 0.65140057, + "learning_rate": 5.189659418944891e-09, + "loss": 0.67359018, + "num_input_tokens_seen": 175554355, + "step": 8131, + "time_per_iteration": 3.061180830001831 + }, + { + "auxiliary_loss_clip": 0.01135104, + "auxiliary_loss_mlp": 0.0108512, + "balance_loss_clip": 1.02586293, + "balance_loss_mlp": 1.00505888, + "epoch": 0.9778151866770877, + "flos": 21178605715200.0, + "grad_norm": 1.9051972590400286, + "language_loss": 0.78292394, + "learning_rate": 5.133730575956674e-09, + "loss": 0.80512613, + "num_input_tokens_seen": 175574025, + "step": 8132, + "time_per_iteration": 2.6247313022613525 + }, + { + "auxiliary_loss_clip": 0.01115957, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_clip": 1.02401543, + "balance_loss_mlp": 1.00380313, + "epoch": 0.9779354295677268, + "flos": 20886795624960.0, + "grad_norm": 1.8240110175125173, + "language_loss": 0.71954703, + "learning_rate": 5.0781043525953696e-09, + "loss": 0.74154574, + "num_input_tokens_seen": 175592090, + "step": 8133, + "time_per_iteration": 2.717252016067505 + }, + { + "auxiliary_loss_clip": 0.01115051, + "auxiliary_loss_mlp": 0.01083058, + "balance_loss_clip": 1.02423513, + "balance_loss_mlp": 1.003093, + "epoch": 0.9780556724583659, + "flos": 23440618748160.0, + "grad_norm": 1.6787532429346164, + "language_loss": 0.73472881, + "learning_rate": 5.0227807572995605e-09, + "loss": 0.75670993, + "num_input_tokens_seen": 175614065, + "step": 8134, + "time_per_iteration": 2.8615939617156982 + }, + { + "auxiliary_loss_clip": 0.01115628, + "auxiliary_loss_mlp": 0.01084338, + "balance_loss_clip": 1.02369881, + "balance_loss_mlp": 1.00418115, + "epoch": 0.9781759153490049, + "flos": 20923244951040.0, + "grad_norm": 2.0966214674522448, + "language_loss": 0.67545348, + "learning_rate": 4.967759798461646e-09, + "loss": 0.69745314, + "num_input_tokens_seen": 175632410, + "step": 8135, + "time_per_iteration": 2.784557580947876 + }, + { + "auxiliary_loss_clip": 0.01134991, + "auxiliary_loss_mlp": 0.01083707, + "balance_loss_clip": 1.02594316, + "balance_loss_mlp": 1.00359797, + "epoch": 0.9782961582396441, + "flos": 28293191539200.0, + "grad_norm": 2.049379682379879, + "language_loss": 0.75084859, + "learning_rate": 4.913041484428282e-09, + "loss": 0.77303553, + "num_input_tokens_seen": 175652885, + "step": 8136, + "time_per_iteration": 2.6420915126800537 + }, + { + "auxiliary_loss_clip": 0.01126057, + "auxiliary_loss_mlp": 0.01084008, + "balance_loss_clip": 1.0251441, + "balance_loss_mlp": 1.00408971, + "epoch": 0.9784164011302832, + "flos": 25552955808000.0, + "grad_norm": 1.6804887198533458, + "language_loss": 0.74321848, + "learning_rate": 4.858625823500384e-09, + "loss": 0.76531911, + "num_input_tokens_seen": 175670585, + "step": 8137, + "time_per_iteration": 2.6645472049713135 + }, + { + "auxiliary_loss_clip": 0.01125947, + "auxiliary_loss_mlp": 0.01084091, + "balance_loss_clip": 1.02468371, + "balance_loss_mlp": 1.00398278, + "epoch": 0.9785366440209222, + "flos": 29965945956480.0, + "grad_norm": 1.8638853886571114, + "language_loss": 0.73353314, + "learning_rate": 4.80451282393246e-09, + "loss": 0.75563353, + "num_input_tokens_seen": 175690570, + "step": 8138, + "time_per_iteration": 2.753812551498413 + }, + { + "auxiliary_loss_clip": 0.01093178, + "auxiliary_loss_mlp": 0.01084124, + "balance_loss_clip": 1.02370024, + "balance_loss_mlp": 1.00396729, + "epoch": 0.9786568869115614, + "flos": 32343591847680.0, + "grad_norm": 1.9510460641184781, + "language_loss": 0.67266965, + "learning_rate": 4.750702493933722e-09, + "loss": 0.69444263, + "num_input_tokens_seen": 175710455, + "step": 8139, + "time_per_iteration": 2.830578088760376 + }, + { + "auxiliary_loss_clip": 0.01111774, + "auxiliary_loss_mlp": 0.00872915, + "balance_loss_clip": 1.02145576, + "balance_loss_mlp": 1.00009608, + "epoch": 0.9787771298022004, + "flos": 23331414424320.0, + "grad_norm": 1.7542433047210688, + "language_loss": 0.85138738, + "learning_rate": 4.697194841666974e-09, + "loss": 0.8712343, + "num_input_tokens_seen": 175729380, + "step": 8140, + "time_per_iteration": 2.872053861618042 + }, + { + "auxiliary_loss_clip": 0.01123932, + "auxiliary_loss_mlp": 0.01083456, + "balance_loss_clip": 1.02354276, + "balance_loss_mlp": 1.00325179, + "epoch": 0.9788973726928395, + "flos": 21468548298240.0, + "grad_norm": 2.1671019908276916, + "language_loss": 0.81638694, + "learning_rate": 4.6439898752492764e-09, + "loss": 0.83846092, + "num_input_tokens_seen": 175749520, + "step": 8141, + "time_per_iteration": 2.640949249267578 + }, + { + "auxiliary_loss_clip": 0.01104899, + "auxiliary_loss_mlp": 0.0087274, + "balance_loss_clip": 1.01786435, + "balance_loss_mlp": 1.00098121, + "epoch": 0.9790176155834787, + "flos": 68897459439360.0, + "grad_norm": 0.7463005880320623, + "language_loss": 0.63701659, + "learning_rate": 4.591087602751731e-09, + "loss": 0.656793, + "num_input_tokens_seen": 175811380, + "step": 8142, + "time_per_iteration": 3.3614299297332764 + }, + { + "auxiliary_loss_clip": 0.01124211, + "auxiliary_loss_mlp": 0.01084282, + "balance_loss_clip": 1.02375412, + "balance_loss_mlp": 1.00426912, + "epoch": 0.9791378584741177, + "flos": 21430877909760.0, + "grad_norm": 1.6828551523933932, + "language_loss": 0.72113252, + "learning_rate": 4.538488032199916e-09, + "loss": 0.74321747, + "num_input_tokens_seen": 175829480, + "step": 8143, + "time_per_iteration": 2.74782133102417 + }, + { + "auxiliary_loss_clip": 0.01125958, + "auxiliary_loss_mlp": 0.01083891, + "balance_loss_clip": 1.02436447, + "balance_loss_mlp": 1.00378203, + "epoch": 0.9792581013647568, + "flos": 20153032594560.0, + "grad_norm": 2.0079883359934376, + "language_loss": 0.69070542, + "learning_rate": 4.486191171572784e-09, + "loss": 0.7128039, + "num_input_tokens_seen": 175846750, + "step": 8144, + "time_per_iteration": 2.5829615592956543 + }, + { + "auxiliary_loss_clip": 0.01126666, + "auxiliary_loss_mlp": 0.01082854, + "balance_loss_clip": 1.02568042, + "balance_loss_mlp": 1.00298357, + "epoch": 0.9793783442553959, + "flos": 23728191033600.0, + "grad_norm": 1.4522058332419714, + "language_loss": 0.77565271, + "learning_rate": 4.434197028803766e-09, + "loss": 0.79774791, + "num_input_tokens_seen": 175865975, + "step": 8145, + "time_per_iteration": 2.7194275856018066 + }, + { + "auxiliary_loss_clip": 0.01108373, + "auxiliary_loss_mlp": 0.01084629, + "balance_loss_clip": 1.02402568, + "balance_loss_mlp": 1.00452065, + "epoch": 0.979498587146035, + "flos": 23038742407680.0, + "grad_norm": 1.9484341400558016, + "language_loss": 0.81847024, + "learning_rate": 4.3825056117805514e-09, + "loss": 0.84040022, + "num_input_tokens_seen": 175881860, + "step": 8146, + "time_per_iteration": 2.7333943843841553 + }, + { + "auxiliary_loss_clip": 0.01134417, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.02501297, + "balance_loss_mlp": 1.0045476, + "epoch": 0.979618830036674, + "flos": 14318841951360.0, + "grad_norm": 3.0019023751632083, + "language_loss": 0.79557562, + "learning_rate": 4.331116928344425e-09, + "loss": 0.81776631, + "num_input_tokens_seen": 175898175, + "step": 8147, + "time_per_iteration": 2.613701343536377 + }, + { + "auxiliary_loss_clip": 0.01117785, + "auxiliary_loss_mlp": 0.00872932, + "balance_loss_clip": 1.02501202, + "balance_loss_mlp": 1.0000453, + "epoch": 0.9797390729273132, + "flos": 16727514215040.0, + "grad_norm": 1.8004650962403694, + "language_loss": 0.62250745, + "learning_rate": 4.28003098629115e-09, + "loss": 0.64241463, + "num_input_tokens_seen": 175914310, + "step": 8148, + "time_per_iteration": 2.669512987136841 + }, + { + "auxiliary_loss_clip": 0.01106977, + "auxiliary_loss_mlp": 0.01082679, + "balance_loss_clip": 1.0222801, + "balance_loss_mlp": 1.00266576, + "epoch": 0.9798593158179523, + "flos": 24532661986560.0, + "grad_norm": 2.5770038008116387, + "language_loss": 0.7865625, + "learning_rate": 4.229247793370305e-09, + "loss": 0.80845916, + "num_input_tokens_seen": 175933435, + "step": 8149, + "time_per_iteration": 3.7643496990203857 + }, + { + "auxiliary_loss_clip": 0.01136701, + "auxiliary_loss_mlp": 0.01084979, + "balance_loss_clip": 1.02736151, + "balance_loss_mlp": 1.00491798, + "epoch": 0.9799795587085913, + "flos": 27308808339840.0, + "grad_norm": 1.5399980961282846, + "language_loss": 0.7069639, + "learning_rate": 4.178767357285951e-09, + "loss": 0.72918081, + "num_input_tokens_seen": 175955065, + "step": 8150, + "time_per_iteration": 3.5712995529174805 + }, + { + "auxiliary_loss_clip": 0.01125925, + "auxiliary_loss_mlp": 0.00872747, + "balance_loss_clip": 1.02585864, + "balance_loss_mlp": 1.00014687, + "epoch": 0.9800998015992305, + "flos": 26286575184000.0, + "grad_norm": 2.4945875395890496, + "language_loss": 0.71891689, + "learning_rate": 4.128589685695516e-09, + "loss": 0.73890352, + "num_input_tokens_seen": 175975490, + "step": 8151, + "time_per_iteration": 2.7042696475982666 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01084505, + "balance_loss_clip": 1.0271256, + "balance_loss_mlp": 1.00434864, + "epoch": 0.9802200444898695, + "flos": 16723635546240.0, + "grad_norm": 2.170140286583271, + "language_loss": 0.84385091, + "learning_rate": 4.078714786211135e-09, + "loss": 0.8660633, + "num_input_tokens_seen": 175991340, + "step": 8152, + "time_per_iteration": 2.627173900604248 + }, + { + "auxiliary_loss_clip": 0.0112391, + "auxiliary_loss_mlp": 0.01084004, + "balance_loss_clip": 1.02436066, + "balance_loss_mlp": 1.00399125, + "epoch": 0.9803402873805086, + "flos": 24900459298560.0, + "grad_norm": 1.6937494395364492, + "language_loss": 0.76764971, + "learning_rate": 4.029142666398977e-09, + "loss": 0.78972888, + "num_input_tokens_seen": 176011505, + "step": 8153, + "time_per_iteration": 2.862809658050537 + }, + { + "auxiliary_loss_clip": 0.01134462, + "auxiliary_loss_mlp": 0.01084045, + "balance_loss_clip": 1.02552867, + "balance_loss_mlp": 1.00398386, + "epoch": 0.9804605302711478, + "flos": 22564937082240.0, + "grad_norm": 2.6170712944789, + "language_loss": 0.80017608, + "learning_rate": 3.979873333778805e-09, + "loss": 0.82236117, + "num_input_tokens_seen": 176029680, + "step": 8154, + "time_per_iteration": 4.326477289199829 + }, + { + "auxiliary_loss_clip": 0.0111663, + "auxiliary_loss_mlp": 0.01083109, + "balance_loss_clip": 1.02413845, + "balance_loss_mlp": 1.0030483, + "epoch": 0.9805807731617868, + "flos": 38905368382080.0, + "grad_norm": 1.894528186663901, + "language_loss": 0.73813426, + "learning_rate": 3.930906795824862e-09, + "loss": 0.76013172, + "num_input_tokens_seen": 176050355, + "step": 8155, + "time_per_iteration": 2.7698659896850586 + }, + { + "auxiliary_loss_clip": 0.01124316, + "auxiliary_loss_mlp": 0.01084626, + "balance_loss_clip": 1.02455771, + "balance_loss_mlp": 1.00451696, + "epoch": 0.9807010160524259, + "flos": 17821999578240.0, + "grad_norm": 1.822839494903396, + "language_loss": 0.76291919, + "learning_rate": 3.882243059965207e-09, + "loss": 0.78500855, + "num_input_tokens_seen": 176068070, + "step": 8156, + "time_per_iteration": 2.5089614391326904 + }, + { + "auxiliary_loss_clip": 0.01126113, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_clip": 1.02472401, + "balance_loss_mlp": 1.00383544, + "epoch": 0.980821258943065, + "flos": 13552975140480.0, + "grad_norm": 2.596427198348876, + "language_loss": 0.65823877, + "learning_rate": 3.833882133582156e-09, + "loss": 0.68034023, + "num_input_tokens_seen": 176083730, + "step": 8157, + "time_per_iteration": 2.5269320011138916 + }, + { + "auxiliary_loss_clip": 0.01127612, + "auxiliary_loss_mlp": 0.01084148, + "balance_loss_clip": 1.0262301, + "balance_loss_mlp": 1.00413465, + "epoch": 0.9809415018337041, + "flos": 21689794120320.0, + "grad_norm": 1.5503754214697818, + "language_loss": 0.77930713, + "learning_rate": 3.785824024012285e-09, + "loss": 0.80142474, + "num_input_tokens_seen": 176102730, + "step": 8158, + "time_per_iteration": 2.621431827545166 + }, + { + "auxiliary_loss_clip": 0.01107175, + "auxiliary_loss_mlp": 0.01085348, + "balance_loss_clip": 1.02292705, + "balance_loss_mlp": 1.00523925, + "epoch": 0.9810617447243432, + "flos": 23294857357440.0, + "grad_norm": 1.6125933773818606, + "language_loss": 0.78395271, + "learning_rate": 3.738068738545541e-09, + "loss": 0.80587786, + "num_input_tokens_seen": 176121815, + "step": 8159, + "time_per_iteration": 2.685073137283325 + }, + { + "auxiliary_loss_clip": 0.01126265, + "auxiliary_loss_mlp": 0.0108393, + "balance_loss_clip": 1.02541482, + "balance_loss_mlp": 1.00372565, + "epoch": 0.9811819876149822, + "flos": 18332038748160.0, + "grad_norm": 2.091729005130914, + "language_loss": 0.7884084, + "learning_rate": 3.6906162844265733e-09, + "loss": 0.81051028, + "num_input_tokens_seen": 176138900, + "step": 8160, + "time_per_iteration": 2.5777618885040283 + }, + { + "auxiliary_loss_clip": 0.01116108, + "auxiliary_loss_mlp": 0.01083875, + "balance_loss_clip": 1.02432966, + "balance_loss_mlp": 1.00381422, + "epoch": 0.9813022305056214, + "flos": 22601961025920.0, + "grad_norm": 1.8328726470755639, + "language_loss": 0.70556164, + "learning_rate": 3.643466668853845e-09, + "loss": 0.72756147, + "num_input_tokens_seen": 176156925, + "step": 8161, + "time_per_iteration": 2.7099967002868652 + }, + { + "auxiliary_loss_clip": 0.01118333, + "auxiliary_loss_mlp": 0.0108455, + "balance_loss_clip": 1.02569079, + "balance_loss_mlp": 1.00453699, + "epoch": 0.9814224733962604, + "flos": 25413335642880.0, + "grad_norm": 2.4204739313685204, + "language_loss": 0.75473809, + "learning_rate": 3.59661989898008e-09, + "loss": 0.7767669, + "num_input_tokens_seen": 176177980, + "step": 8162, + "time_per_iteration": 2.746204376220703 + }, + { + "auxiliary_loss_clip": 0.01104796, + "auxiliary_loss_mlp": 0.01083263, + "balance_loss_clip": 1.02290297, + "balance_loss_mlp": 1.00315428, + "epoch": 0.9815427162868995, + "flos": 25007185584000.0, + "grad_norm": 1.7470801589110083, + "language_loss": 0.76534712, + "learning_rate": 3.5500759819115934e-09, + "loss": 0.78722775, + "num_input_tokens_seen": 176198345, + "step": 8163, + "time_per_iteration": 2.7363438606262207 + }, + { + "auxiliary_loss_clip": 0.01135408, + "auxiliary_loss_mlp": 0.01084933, + "balance_loss_clip": 1.02625716, + "balance_loss_mlp": 1.00491977, + "epoch": 0.9816629591775387, + "flos": 20662604887680.0, + "grad_norm": 1.811004306535642, + "language_loss": 0.81044662, + "learning_rate": 3.5038349247094034e-09, + "loss": 0.83265001, + "num_input_tokens_seen": 176215605, + "step": 8164, + "time_per_iteration": 2.5636417865753174 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01084987, + "balance_loss_clip": 1.02381527, + "balance_loss_mlp": 1.00483108, + "epoch": 0.9817832020681777, + "flos": 17712220636800.0, + "grad_norm": 2.2445599773904545, + "language_loss": 0.77236342, + "learning_rate": 3.4578967343878994e-09, + "loss": 0.79438335, + "num_input_tokens_seen": 176231810, + "step": 8165, + "time_per_iteration": 2.668567180633545 + }, + { + "auxiliary_loss_clip": 0.01114618, + "auxiliary_loss_mlp": 0.01084529, + "balance_loss_clip": 1.02397668, + "balance_loss_mlp": 1.00437295, + "epoch": 0.9819034449588168, + "flos": 22530032040960.0, + "grad_norm": 6.075720845664341, + "language_loss": 0.81042355, + "learning_rate": 3.4122614179161733e-09, + "loss": 0.83241498, + "num_input_tokens_seen": 176251770, + "step": 8166, + "time_per_iteration": 2.7522921562194824 + }, + { + "auxiliary_loss_clip": 0.01103011, + "auxiliary_loss_mlp": 0.01083628, + "balance_loss_clip": 1.02080011, + "balance_loss_mlp": 1.00361514, + "epoch": 0.9820236878494559, + "flos": 20011221699840.0, + "grad_norm": 1.6131392718128506, + "language_loss": 0.78246933, + "learning_rate": 3.36692898221691e-09, + "loss": 0.80433571, + "num_input_tokens_seen": 176270135, + "step": 8167, + "time_per_iteration": 2.7739763259887695 + }, + { + "auxiliary_loss_clip": 0.01126832, + "auxiliary_loss_mlp": 0.01083894, + "balance_loss_clip": 1.02578878, + "balance_loss_mlp": 1.00383341, + "epoch": 0.982143930740095, + "flos": 18807316531200.0, + "grad_norm": 1.7674594262275793, + "language_loss": 0.73463261, + "learning_rate": 3.3218994341668305e-09, + "loss": 0.75673985, + "num_input_tokens_seen": 176289065, + "step": 8168, + "time_per_iteration": 2.697201728820801 + }, + { + "auxiliary_loss_clip": 0.01135858, + "auxiliary_loss_mlp": 0.0108547, + "balance_loss_clip": 1.02714324, + "balance_loss_mlp": 1.00536168, + "epoch": 0.982264173630734, + "flos": 26578026138240.0, + "grad_norm": 1.725952795165885, + "language_loss": 0.75581288, + "learning_rate": 3.2771727805971373e-09, + "loss": 0.77802622, + "num_input_tokens_seen": 176310450, + "step": 8169, + "time_per_iteration": 2.671473741531372 + }, + { + "auxiliary_loss_clip": 0.01098366, + "auxiliary_loss_mlp": 0.01084563, + "balance_loss_clip": 1.02208948, + "balance_loss_mlp": 1.00440693, + "epoch": 0.9823844165213732, + "flos": 22014462176640.0, + "grad_norm": 1.821862268060848, + "language_loss": 0.77022702, + "learning_rate": 3.232749028292847e-09, + "loss": 0.79205626, + "num_input_tokens_seen": 176327415, + "step": 8170, + "time_per_iteration": 2.863647222518921 + }, + { + "auxiliary_loss_clip": 0.01134955, + "auxiliary_loss_mlp": 0.01083223, + "balance_loss_clip": 1.02534211, + "balance_loss_mlp": 1.00311422, + "epoch": 0.9825046594120123, + "flos": 21908166854400.0, + "grad_norm": 1.8378282034444162, + "language_loss": 0.88104546, + "learning_rate": 3.188628183992792e-09, + "loss": 0.90322721, + "num_input_tokens_seen": 176347680, + "step": 8171, + "time_per_iteration": 2.568182945251465 + }, + { + "auxiliary_loss_clip": 0.01104922, + "auxiliary_loss_mlp": 0.010789, + "balance_loss_clip": 1.01751208, + "balance_loss_mlp": 0.99993581, + "epoch": 0.9826249023026513, + "flos": 59494610718720.0, + "grad_norm": 0.7351712973929722, + "language_loss": 0.62570286, + "learning_rate": 3.1448102543902844e-09, + "loss": 0.64754105, + "num_input_tokens_seen": 176411595, + "step": 8172, + "time_per_iteration": 3.2235095500946045 + }, + { + "auxiliary_loss_clip": 0.01118214, + "auxiliary_loss_mlp": 0.01085226, + "balance_loss_clip": 1.0260278, + "balance_loss_mlp": 1.00511718, + "epoch": 0.9827451451932905, + "flos": 16071031296000.0, + "grad_norm": 2.0320557621169333, + "language_loss": 0.67337137, + "learning_rate": 3.1012952461324515e-09, + "loss": 0.69540584, + "num_input_tokens_seen": 176430570, + "step": 8173, + "time_per_iteration": 2.6632561683654785 + }, + { + "auxiliary_loss_clip": 0.01124927, + "auxiliary_loss_mlp": 0.01084509, + "balance_loss_clip": 1.02549803, + "balance_loss_mlp": 1.00444853, + "epoch": 0.9828653880839295, + "flos": 20262775622400.0, + "grad_norm": 2.3212283785855363, + "language_loss": 0.73593193, + "learning_rate": 3.0580831658204575e-09, + "loss": 0.7580263, + "num_input_tokens_seen": 176448150, + "step": 8174, + "time_per_iteration": 2.647106170654297 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01085077, + "balance_loss_clip": 1.02489066, + "balance_loss_mlp": 1.00501561, + "epoch": 0.9829856309745686, + "flos": 21616141282560.0, + "grad_norm": 1.5438709096237662, + "language_loss": 0.78012908, + "learning_rate": 3.015174020009281e-09, + "loss": 0.80222511, + "num_input_tokens_seen": 176467475, + "step": 8175, + "time_per_iteration": 4.608840703964233 + }, + { + "auxiliary_loss_clip": 0.01110064, + "auxiliary_loss_mlp": 0.01083422, + "balance_loss_clip": 1.0249722, + "balance_loss_mlp": 1.00340915, + "epoch": 0.9831058738652078, + "flos": 23764209396480.0, + "grad_norm": 1.9459790522631695, + "language_loss": 0.74977863, + "learning_rate": 2.9725678152086043e-09, + "loss": 0.7717135, + "num_input_tokens_seen": 176486045, + "step": 8176, + "time_per_iteration": 2.737212657928467 + }, + { + "auxiliary_loss_clip": 0.01109741, + "auxiliary_loss_mlp": 0.01084227, + "balance_loss_clip": 1.02383828, + "balance_loss_mlp": 1.00407052, + "epoch": 0.9832261167558468, + "flos": 11320911072000.0, + "grad_norm": 2.3891242471098018, + "language_loss": 0.82568526, + "learning_rate": 2.930264557881257e-09, + "loss": 0.84762496, + "num_input_tokens_seen": 176501230, + "step": 8177, + "time_per_iteration": 2.71476674079895 + }, + { + "auxiliary_loss_clip": 0.01112837, + "auxiliary_loss_mlp": 0.01079167, + "balance_loss_clip": 1.01767421, + "balance_loss_mlp": 1.00020242, + "epoch": 0.9833463596464859, + "flos": 60000304343040.0, + "grad_norm": 0.8241824267854638, + "language_loss": 0.58186567, + "learning_rate": 2.8882642544452163e-09, + "loss": 0.60378563, + "num_input_tokens_seen": 176565955, + "step": 8178, + "time_per_iteration": 3.260908842086792 + }, + { + "auxiliary_loss_clip": 0.01115492, + "auxiliary_loss_mlp": 0.01083568, + "balance_loss_clip": 1.02370644, + "balance_loss_mlp": 1.00345898, + "epoch": 0.983466602537125, + "flos": 13626699805440.0, + "grad_norm": 2.1759631338596015, + "language_loss": 0.7428813, + "learning_rate": 2.8465669112716083e-09, + "loss": 0.7648719, + "num_input_tokens_seen": 176583480, + "step": 8179, + "time_per_iteration": 2.6816952228546143 + }, + { + "auxiliary_loss_clip": 0.01127873, + "auxiliary_loss_mlp": 0.0087295, + "balance_loss_clip": 1.02645516, + "balance_loss_mlp": 1.00013137, + "epoch": 0.9835868454277641, + "flos": 22926844563840.0, + "grad_norm": 2.2621102973887353, + "language_loss": 0.762568, + "learning_rate": 2.8051725346858177e-09, + "loss": 0.7825762, + "num_input_tokens_seen": 176603740, + "step": 8180, + "time_per_iteration": 4.46120285987854 + }, + { + "auxiliary_loss_clip": 0.01134669, + "auxiliary_loss_mlp": 0.01083469, + "balance_loss_clip": 1.02500188, + "balance_loss_mlp": 1.00331259, + "epoch": 0.9837070883184031, + "flos": 27673409341440.0, + "grad_norm": 1.9332356856472481, + "language_loss": 0.71195221, + "learning_rate": 2.7640811309674883e-09, + "loss": 0.7341336, + "num_input_tokens_seen": 176623240, + "step": 8181, + "time_per_iteration": 2.7488081455230713 + }, + { + "auxiliary_loss_clip": 0.011002, + "auxiliary_loss_mlp": 0.01084262, + "balance_loss_clip": 1.02355051, + "balance_loss_mlp": 1.00420094, + "epoch": 0.9838273312090423, + "flos": 29241951425280.0, + "grad_norm": 4.922807756327656, + "language_loss": 0.80749363, + "learning_rate": 2.7232927063498557e-09, + "loss": 0.82933819, + "num_input_tokens_seen": 176643615, + "step": 8182, + "time_per_iteration": 2.800288438796997 + }, + { + "auxiliary_loss_clip": 0.01124981, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_clip": 1.0243628, + "balance_loss_mlp": 1.00423229, + "epoch": 0.9839475740996814, + "flos": 40110207304320.0, + "grad_norm": 1.8012045495455187, + "language_loss": 0.68914342, + "learning_rate": 2.682807267020859e-09, + "loss": 0.71123761, + "num_input_tokens_seen": 176666375, + "step": 8183, + "time_per_iteration": 2.8304646015167236 + }, + { + "auxiliary_loss_clip": 0.01124146, + "auxiliary_loss_mlp": 0.0108383, + "balance_loss_clip": 1.0243187, + "balance_loss_mlp": 1.00367379, + "epoch": 0.9840678169903204, + "flos": 24169389788160.0, + "grad_norm": 1.7017291813194602, + "language_loss": 0.62470061, + "learning_rate": 2.642624819121808e-09, + "loss": 0.64678037, + "num_input_tokens_seen": 176686525, + "step": 8184, + "time_per_iteration": 2.681817054748535 + }, + { + "auxiliary_loss_clip": 0.01114899, + "auxiliary_loss_mlp": 0.01085491, + "balance_loss_clip": 1.02423739, + "balance_loss_mlp": 1.00538206, + "epoch": 0.9841880598809596, + "flos": 14684484447360.0, + "grad_norm": 1.9036367372022995, + "language_loss": 0.61585099, + "learning_rate": 2.6027453687487154e-09, + "loss": 0.63785493, + "num_input_tokens_seen": 176703615, + "step": 8185, + "time_per_iteration": 2.763608694076538 + }, + { + "auxiliary_loss_clip": 0.01114452, + "auxiliary_loss_mlp": 0.01083382, + "balance_loss_clip": 1.0239315, + "balance_loss_mlp": 1.00327301, + "epoch": 0.9843083027715986, + "flos": 22344768668160.0, + "grad_norm": 2.248788175351021, + "language_loss": 0.53796577, + "learning_rate": 2.5631689219509643e-09, + "loss": 0.55994415, + "num_input_tokens_seen": 176722295, + "step": 8186, + "time_per_iteration": 2.739767551422119 + }, + { + "auxiliary_loss_clip": 0.01100006, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_clip": 1.02576602, + "balance_loss_mlp": 1.00417972, + "epoch": 0.9844285456622377, + "flos": 21800111765760.0, + "grad_norm": 1.6299394068535835, + "language_loss": 0.83487833, + "learning_rate": 2.523895484732197e-09, + "loss": 0.85671932, + "num_input_tokens_seen": 176741750, + "step": 8187, + "time_per_iteration": 2.7186858654022217 + }, + { + "auxiliary_loss_clip": 0.01126985, + "auxiliary_loss_mlp": 0.01084254, + "balance_loss_clip": 1.0251112, + "balance_loss_mlp": 1.00404954, + "epoch": 0.9845487885528769, + "flos": 18035380321920.0, + "grad_norm": 1.8562564059837585, + "language_loss": 0.74782807, + "learning_rate": 2.4849250630505357e-09, + "loss": 0.7699405, + "num_input_tokens_seen": 176759995, + "step": 8188, + "time_per_iteration": 2.6799745559692383 + }, + { + "auxiliary_loss_clip": 0.01071175, + "auxiliary_loss_mlp": 0.01084501, + "balance_loss_clip": 1.02092242, + "balance_loss_mlp": 1.00443959, + "epoch": 0.9846690314435159, + "flos": 25228610974080.0, + "grad_norm": 1.7070429484081913, + "language_loss": 0.73426104, + "learning_rate": 2.4462576628172528e-09, + "loss": 0.75581777, + "num_input_tokens_seen": 176778625, + "step": 8189, + "time_per_iteration": 2.8990776538848877 + }, + { + "auxiliary_loss_clip": 0.01124142, + "auxiliary_loss_mlp": 0.01084539, + "balance_loss_clip": 1.02414858, + "balance_loss_mlp": 1.00438261, + "epoch": 0.984789274334155, + "flos": 18552171248640.0, + "grad_norm": 1.7512843383958259, + "language_loss": 0.74084789, + "learning_rate": 2.407893289898766e-09, + "loss": 0.76293468, + "num_input_tokens_seen": 176797655, + "step": 8190, + "time_per_iteration": 2.673654556274414 + }, + { + "auxiliary_loss_clip": 0.01104953, + "auxiliary_loss_mlp": 0.01084171, + "balance_loss_clip": 1.02231765, + "balance_loss_mlp": 1.00415802, + "epoch": 0.984909517224794, + "flos": 27345437233920.0, + "grad_norm": 1.73182477536273, + "language_loss": 0.83768272, + "learning_rate": 2.3698319501144202e-09, + "loss": 0.85957396, + "num_input_tokens_seen": 176818640, + "step": 8191, + "time_per_iteration": 2.7702035903930664 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01083746, + "balance_loss_clip": 1.02539206, + "balance_loss_mlp": 1.00358927, + "epoch": 0.9850297601154332, + "flos": 18734058743040.0, + "grad_norm": 1.5064289663928512, + "language_loss": 0.73367894, + "learning_rate": 2.3320736492382644e-09, + "loss": 0.75561637, + "num_input_tokens_seen": 176837475, + "step": 8192, + "time_per_iteration": 2.659393787384033 + }, + { + "auxiliary_loss_clip": 0.01134155, + "auxiliary_loss_mlp": 0.01084307, + "balance_loss_clip": 1.02557087, + "balance_loss_mlp": 1.00419855, + "epoch": 0.9851500030060723, + "flos": 22308247514880.0, + "grad_norm": 1.5176057298007688, + "language_loss": 0.67859495, + "learning_rate": 2.29461839299816e-09, + "loss": 0.70077956, + "num_input_tokens_seen": 176857190, + "step": 8193, + "time_per_iteration": 2.6508243083953857 + }, + { + "auxiliary_loss_clip": 0.01091595, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_clip": 1.02514267, + "balance_loss_mlp": 1.0033071, + "epoch": 0.9852702458967113, + "flos": 26353691746560.0, + "grad_norm": 1.580244911413591, + "language_loss": 0.79837865, + "learning_rate": 2.257466187076229e-09, + "loss": 0.82012832, + "num_input_tokens_seen": 176876395, + "step": 8194, + "time_per_iteration": 2.9268510341644287 + }, + { + "auxiliary_loss_clip": 0.01127707, + "auxiliary_loss_mlp": 0.00872913, + "balance_loss_clip": 1.02604771, + "balance_loss_mlp": 1.00006282, + "epoch": 0.9853904887873505, + "flos": 20883599314560.0, + "grad_norm": 1.793679153965137, + "language_loss": 0.71263587, + "learning_rate": 2.2206170371081854e-09, + "loss": 0.73264205, + "num_input_tokens_seen": 176894980, + "step": 8195, + "time_per_iteration": 2.699820041656494 + }, + { + "auxiliary_loss_clip": 0.01119025, + "auxiliary_loss_mlp": 0.01083962, + "balance_loss_clip": 1.02509785, + "balance_loss_mlp": 1.00385356, + "epoch": 0.9855107316779895, + "flos": 25263444188160.0, + "grad_norm": 1.5887590687623725, + "language_loss": 0.84740752, + "learning_rate": 2.1840709486842247e-09, + "loss": 0.86943746, + "num_input_tokens_seen": 176914600, + "step": 8196, + "time_per_iteration": 2.7489800453186035 + }, + { + "auxiliary_loss_clip": 0.01110121, + "auxiliary_loss_mlp": 0.01084775, + "balance_loss_clip": 1.02350092, + "balance_loss_mlp": 1.00457048, + "epoch": 0.9856309745686286, + "flos": 19062102677760.0, + "grad_norm": 2.0036155015455503, + "language_loss": 0.78974873, + "learning_rate": 2.1478279273481335e-09, + "loss": 0.81169772, + "num_input_tokens_seen": 176933085, + "step": 8197, + "time_per_iteration": 2.711167812347412 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01083623, + "balance_loss_clip": 1.02579927, + "balance_loss_mlp": 1.00356197, + "epoch": 0.9857512174592677, + "flos": 34130758060800.0, + "grad_norm": 2.0382952365021927, + "language_loss": 0.80005252, + "learning_rate": 2.1118879785981815e-09, + "loss": 0.82213658, + "num_input_tokens_seen": 176953225, + "step": 8198, + "time_per_iteration": 2.768995523452759 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01083352, + "balance_loss_clip": 1.02366257, + "balance_loss_mlp": 1.00338662, + "epoch": 0.9858714603499068, + "flos": 25994693266560.0, + "grad_norm": 2.5889339692971687, + "language_loss": 0.79644728, + "learning_rate": 2.0762511078862288e-09, + "loss": 0.81838071, + "num_input_tokens_seen": 176973570, + "step": 8199, + "time_per_iteration": 2.7595837116241455 + }, + { + "auxiliary_loss_clip": 0.01100116, + "auxiliary_loss_mlp": 0.01083426, + "balance_loss_clip": 1.02398896, + "balance_loss_mlp": 1.00326967, + "epoch": 0.9859917032405459, + "flos": 23696230907520.0, + "grad_norm": 1.7859010717308141, + "language_loss": 0.64782524, + "learning_rate": 2.0409173206186183e-09, + "loss": 0.66966069, + "num_input_tokens_seen": 176992810, + "step": 8200, + "time_per_iteration": 3.6724276542663574 + }, + { + "auxiliary_loss_clip": 0.01100562, + "auxiliary_loss_mlp": 0.0108466, + "balance_loss_clip": 1.02416897, + "balance_loss_mlp": 1.00464702, + "epoch": 0.986111946131185, + "flos": 19938287134080.0, + "grad_norm": 1.8443685526636557, + "language_loss": 0.87028074, + "learning_rate": 2.0058866221550617e-09, + "loss": 0.892133, + "num_input_tokens_seen": 177011050, + "step": 8201, + "time_per_iteration": 3.6663594245910645 + }, + { + "auxiliary_loss_clip": 0.01134494, + "auxiliary_loss_mlp": 0.01084386, + "balance_loss_clip": 1.02514601, + "balance_loss_mlp": 1.00427771, + "epoch": 0.9862321890218241, + "flos": 19828831415040.0, + "grad_norm": 4.4157096041666195, + "language_loss": 0.74956357, + "learning_rate": 1.971159017809976e-09, + "loss": 0.77175236, + "num_input_tokens_seen": 177029340, + "step": 8202, + "time_per_iteration": 2.626403570175171 + }, + { + "auxiliary_loss_clip": 0.01125449, + "auxiliary_loss_mlp": 0.01084122, + "balance_loss_clip": 1.02544212, + "balance_loss_mlp": 1.00401306, + "epoch": 0.9863524319124631, + "flos": 21652051904640.0, + "grad_norm": 2.240596477211036, + "language_loss": 0.77397609, + "learning_rate": 1.93673451285159e-09, + "loss": 0.79607177, + "num_input_tokens_seen": 177048390, + "step": 8203, + "time_per_iteration": 2.6900408267974854 + }, + { + "auxiliary_loss_clip": 0.0109724, + "auxiliary_loss_mlp": 0.01078904, + "balance_loss_clip": 1.01779318, + "balance_loss_mlp": 0.99993974, + "epoch": 0.9864726748031023, + "flos": 52769977920000.0, + "grad_norm": 0.7344366310682197, + "language_loss": 0.56536275, + "learning_rate": 1.9026131125019495e-09, + "loss": 0.58712417, + "num_input_tokens_seen": 177105760, + "step": 8204, + "time_per_iteration": 3.240877151489258 + }, + { + "auxiliary_loss_clip": 0.01118472, + "auxiliary_loss_mlp": 0.01083921, + "balance_loss_clip": 1.02469194, + "balance_loss_mlp": 1.00390744, + "epoch": 0.9865929176937414, + "flos": 23364631526400.0, + "grad_norm": 1.6961914493861778, + "language_loss": 0.87150091, + "learning_rate": 1.8687948219371363e-09, + "loss": 0.89352489, + "num_input_tokens_seen": 177124985, + "step": 8205, + "time_per_iteration": 4.466286897659302 + }, + { + "auxiliary_loss_clip": 0.01135196, + "auxiliary_loss_mlp": 0.0108369, + "balance_loss_clip": 1.02498353, + "balance_loss_mlp": 1.00353336, + "epoch": 0.9867131605843804, + "flos": 21616679986560.0, + "grad_norm": 1.799060601588149, + "language_loss": 0.88425666, + "learning_rate": 1.835279646287491e-09, + "loss": 0.9064455, + "num_input_tokens_seen": 177142995, + "step": 8206, + "time_per_iteration": 2.611006259918213 + }, + { + "auxiliary_loss_clip": 0.01128695, + "auxiliary_loss_mlp": 0.01085437, + "balance_loss_clip": 1.02680731, + "balance_loss_mlp": 1.00528049, + "epoch": 0.9868334034750196, + "flos": 22271403139200.0, + "grad_norm": 1.6419743025064695, + "language_loss": 0.76414925, + "learning_rate": 1.8020675906371685e-09, + "loss": 0.78629053, + "num_input_tokens_seen": 177162390, + "step": 8207, + "time_per_iteration": 2.7783753871917725 + }, + { + "auxiliary_loss_clip": 0.0109564, + "auxiliary_loss_mlp": 0.01083962, + "balance_loss_clip": 1.02219748, + "balance_loss_mlp": 1.00394869, + "epoch": 0.9869536463656586, + "flos": 25809573548160.0, + "grad_norm": 1.9301194966990884, + "language_loss": 0.75065279, + "learning_rate": 1.7691586600243612e-09, + "loss": 0.77244878, + "num_input_tokens_seen": 177181290, + "step": 8208, + "time_per_iteration": 2.791722297668457 + }, + { + "auxiliary_loss_clip": 0.01111044, + "auxiliary_loss_mlp": 0.01083253, + "balance_loss_clip": 1.02202082, + "balance_loss_mlp": 1.00328732, + "epoch": 0.9870738892562977, + "flos": 16398500613120.0, + "grad_norm": 2.570329224313118, + "language_loss": 0.86775696, + "learning_rate": 1.7365528594415202e-09, + "loss": 0.88969994, + "num_input_tokens_seen": 177195360, + "step": 8209, + "time_per_iteration": 2.7072319984436035 + }, + { + "auxiliary_loss_clip": 0.01125767, + "auxiliary_loss_mlp": 0.00872953, + "balance_loss_clip": 1.02472019, + "balance_loss_mlp": 1.00009084, + "epoch": 0.9871941321469369, + "flos": 35481358373760.0, + "grad_norm": 1.5822751930611898, + "language_loss": 0.67177665, + "learning_rate": 1.7042501938346888e-09, + "loss": 0.69176382, + "num_input_tokens_seen": 177218090, + "step": 8210, + "time_per_iteration": 2.757641315460205 + }, + { + "auxiliary_loss_clip": 0.01117567, + "auxiliary_loss_mlp": 0.01083323, + "balance_loss_clip": 1.02446508, + "balance_loss_mlp": 1.00326216, + "epoch": 0.9873143750375759, + "flos": 21434217874560.0, + "grad_norm": 1.9915032757409217, + "language_loss": 0.76447195, + "learning_rate": 1.6722506681043913e-09, + "loss": 0.78648078, + "num_input_tokens_seen": 177237050, + "step": 8211, + "time_per_iteration": 2.689713716506958 + }, + { + "auxiliary_loss_clip": 0.01117592, + "auxiliary_loss_mlp": 0.01085947, + "balance_loss_clip": 1.02554119, + "balance_loss_mlp": 1.00583863, + "epoch": 0.987434617928215, + "flos": 16326499800960.0, + "grad_norm": 1.9677276790921567, + "language_loss": 0.68905568, + "learning_rate": 1.640554287104745e-09, + "loss": 0.71109104, + "num_input_tokens_seen": 177255325, + "step": 8212, + "time_per_iteration": 2.6886844635009766 + }, + { + "auxiliary_loss_clip": 0.01107487, + "auxiliary_loss_mlp": 0.01083662, + "balance_loss_clip": 1.02307272, + "balance_loss_mlp": 1.00345755, + "epoch": 0.9875548608188541, + "flos": 17851984456320.0, + "grad_norm": 2.5155965416537196, + "language_loss": 0.79851496, + "learning_rate": 1.609161055644348e-09, + "loss": 0.82042646, + "num_input_tokens_seen": 177271250, + "step": 8213, + "time_per_iteration": 2.7295193672180176 + }, + { + "auxiliary_loss_clip": 0.01127386, + "auxiliary_loss_mlp": 0.01084241, + "balance_loss_clip": 1.02527463, + "balance_loss_mlp": 1.00408411, + "epoch": 0.9876751037094932, + "flos": 26132876887680.0, + "grad_norm": 3.3552758735085644, + "language_loss": 0.68373883, + "learning_rate": 1.5780709784849467e-09, + "loss": 0.70585513, + "num_input_tokens_seen": 177288270, + "step": 8214, + "time_per_iteration": 2.661008358001709 + }, + { + "auxiliary_loss_clip": 0.01084949, + "auxiliary_loss_mlp": 0.01083851, + "balance_loss_clip": 1.02077317, + "balance_loss_mlp": 1.00364685, + "epoch": 0.9877953466001322, + "flos": 15991344973440.0, + "grad_norm": 1.7678623291744247, + "language_loss": 0.82090473, + "learning_rate": 1.5472840603436565e-09, + "loss": 0.84259278, + "num_input_tokens_seen": 177305500, + "step": 8215, + "time_per_iteration": 2.8697516918182373 + }, + { + "auxiliary_loss_clip": 0.01098812, + "auxiliary_loss_mlp": 0.01084711, + "balance_loss_clip": 1.02416539, + "balance_loss_mlp": 1.00455451, + "epoch": 0.9879155894907714, + "flos": 18806777827200.0, + "grad_norm": 1.849119022898004, + "language_loss": 0.78093803, + "learning_rate": 1.5168003058900757e-09, + "loss": 0.80277324, + "num_input_tokens_seen": 177323500, + "step": 8216, + "time_per_iteration": 2.6884512901306152 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01083645, + "balance_loss_clip": 1.02208424, + "balance_loss_mlp": 1.00353599, + "epoch": 0.9880358323814105, + "flos": 22382044007040.0, + "grad_norm": 1.9387773130533137, + "language_loss": 0.92100465, + "learning_rate": 1.4866197197491715e-09, + "loss": 0.94288433, + "num_input_tokens_seen": 177342860, + "step": 8217, + "time_per_iteration": 2.748704671859741 + }, + { + "auxiliary_loss_clip": 0.01125082, + "auxiliary_loss_mlp": 0.00872909, + "balance_loss_clip": 1.02478373, + "balance_loss_mlp": 1.00008297, + "epoch": 0.9881560752720495, + "flos": 15668831733120.0, + "grad_norm": 3.3311496822479465, + "language_loss": 0.7916851, + "learning_rate": 1.4567423064988371e-09, + "loss": 0.81166506, + "num_input_tokens_seen": 177360210, + "step": 8218, + "time_per_iteration": 2.7712392807006836 + }, + { + "auxiliary_loss_clip": 0.01134896, + "auxiliary_loss_mlp": 0.01084371, + "balance_loss_clip": 1.02502632, + "balance_loss_mlp": 1.00430954, + "epoch": 0.9882763181626887, + "flos": 21500113374720.0, + "grad_norm": 2.1573349063586407, + "language_loss": 0.78103215, + "learning_rate": 1.4271680706718913e-09, + "loss": 0.8032248, + "num_input_tokens_seen": 177377885, + "step": 8219, + "time_per_iteration": 2.6143298149108887 + }, + { + "auxiliary_loss_clip": 0.01124913, + "auxiliary_loss_mlp": 0.01085179, + "balance_loss_clip": 1.02472317, + "balance_loss_mlp": 1.00492728, + "epoch": 0.9883965610533277, + "flos": 28034598551040.0, + "grad_norm": 2.076905075553433, + "language_loss": 0.82304919, + "learning_rate": 1.3978970167543013e-09, + "loss": 0.84515017, + "num_input_tokens_seen": 177398065, + "step": 8220, + "time_per_iteration": 2.715958595275879 + }, + { + "auxiliary_loss_clip": 0.01115352, + "auxiliary_loss_mlp": 0.01084287, + "balance_loss_clip": 1.02419746, + "balance_loss_mlp": 1.00398731, + "epoch": 0.9885168039439668, + "flos": 14098601710080.0, + "grad_norm": 1.883784135600216, + "language_loss": 0.7769773, + "learning_rate": 1.3689291491867372e-09, + "loss": 0.79897368, + "num_input_tokens_seen": 177416380, + "step": 8221, + "time_per_iteration": 2.6897058486938477 + }, + { + "auxiliary_loss_clip": 0.01134841, + "auxiliary_loss_mlp": 0.01083834, + "balance_loss_clip": 1.02548051, + "balance_loss_mlp": 1.00372529, + "epoch": 0.988637046834606, + "flos": 26432013352320.0, + "grad_norm": 1.9202922573566037, + "language_loss": 0.73348677, + "learning_rate": 1.3402644723636836e-09, + "loss": 0.75567359, + "num_input_tokens_seen": 177438410, + "step": 8222, + "time_per_iteration": 2.717984676361084 + }, + { + "auxiliary_loss_clip": 0.01108187, + "auxiliary_loss_mlp": 0.01084792, + "balance_loss_clip": 1.02344847, + "balance_loss_mlp": 1.00458765, + "epoch": 0.988757289725245, + "flos": 25229113764480.0, + "grad_norm": 2.1187067012282546, + "language_loss": 0.8379243, + "learning_rate": 1.311902990633218e-09, + "loss": 0.8598541, + "num_input_tokens_seen": 177457375, + "step": 8223, + "time_per_iteration": 2.7573089599609375 + }, + { + "auxiliary_loss_clip": 0.01116635, + "auxiliary_loss_mlp": 0.01083624, + "balance_loss_clip": 1.02353716, + "balance_loss_mlp": 1.00351572, + "epoch": 0.9888775326158841, + "flos": 26359042872960.0, + "grad_norm": 1.521337654213227, + "language_loss": 0.71357042, + "learning_rate": 1.2838447082978987e-09, + "loss": 0.73557293, + "num_input_tokens_seen": 177478530, + "step": 8224, + "time_per_iteration": 2.8356385231018066 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01084512, + "balance_loss_clip": 1.02458, + "balance_loss_mlp": 1.00435591, + "epoch": 0.9889977755065231, + "flos": 24316120846080.0, + "grad_norm": 2.276193602011884, + "language_loss": 0.82985544, + "learning_rate": 1.2560896296143208e-09, + "loss": 0.8519578, + "num_input_tokens_seen": 177496995, + "step": 8225, + "time_per_iteration": 3.617575168609619 + }, + { + "auxiliary_loss_clip": 0.01134788, + "auxiliary_loss_mlp": 0.01084415, + "balance_loss_clip": 1.02549827, + "balance_loss_mlp": 1.00425863, + "epoch": 0.9891180183971623, + "flos": 18951066760320.0, + "grad_norm": 2.230573133094728, + "language_loss": 0.81879908, + "learning_rate": 1.2286377587926722e-09, + "loss": 0.84099114, + "num_input_tokens_seen": 177513785, + "step": 8226, + "time_per_iteration": 3.4831607341766357 + }, + { + "auxiliary_loss_clip": 0.01134048, + "auxiliary_loss_mlp": 0.01084169, + "balance_loss_clip": 1.02472425, + "balance_loss_mlp": 1.00401318, + "epoch": 0.9892382612878013, + "flos": 26176580760960.0, + "grad_norm": 1.9443710707870117, + "language_loss": 0.74912286, + "learning_rate": 1.2014890999973992e-09, + "loss": 0.77130502, + "num_input_tokens_seen": 177530705, + "step": 8227, + "time_per_iteration": 2.672422170639038 + }, + { + "auxiliary_loss_clip": 0.01134308, + "auxiliary_loss_mlp": 0.01083887, + "balance_loss_clip": 1.02488053, + "balance_loss_mlp": 1.0038259, + "epoch": 0.9893585041784404, + "flos": 25449605400960.0, + "grad_norm": 2.491663631306718, + "language_loss": 0.78447646, + "learning_rate": 1.1746436573472073e-09, + "loss": 0.80665839, + "num_input_tokens_seen": 177552440, + "step": 8228, + "time_per_iteration": 2.622634172439575 + }, + { + "auxiliary_loss_clip": 0.01119148, + "auxiliary_loss_mlp": 0.01085118, + "balance_loss_clip": 1.02546716, + "balance_loss_mlp": 1.00496221, + "epoch": 0.9894787470690796, + "flos": 20189302352640.0, + "grad_norm": 1.8058599001644, + "language_loss": 0.68961859, + "learning_rate": 1.1481014349141726e-09, + "loss": 0.71166122, + "num_input_tokens_seen": 177569660, + "step": 8229, + "time_per_iteration": 2.7047765254974365 + }, + { + "auxiliary_loss_clip": 0.01114324, + "auxiliary_loss_mlp": 0.01083829, + "balance_loss_clip": 1.02363741, + "balance_loss_mlp": 1.00362504, + "epoch": 0.9895989899597186, + "flos": 24644308435200.0, + "grad_norm": 1.8331016424973974, + "language_loss": 0.84235638, + "learning_rate": 1.121862436724852e-09, + "loss": 0.86433792, + "num_input_tokens_seen": 177588500, + "step": 8230, + "time_per_iteration": 3.6904637813568115 + }, + { + "auxiliary_loss_clip": 0.01129196, + "auxiliary_loss_mlp": 0.01084243, + "balance_loss_clip": 1.02741551, + "balance_loss_mlp": 1.00413442, + "epoch": 0.9897192328503577, + "flos": 21799034357760.0, + "grad_norm": 1.8192518475135149, + "language_loss": 0.70441878, + "learning_rate": 1.0959266667598388e-09, + "loss": 0.72655308, + "num_input_tokens_seen": 177607315, + "step": 8231, + "time_per_iteration": 3.6077306270599365 + }, + { + "auxiliary_loss_clip": 0.01104727, + "auxiliary_loss_mlp": 0.01084791, + "balance_loss_clip": 1.02265096, + "balance_loss_mlp": 1.00453925, + "epoch": 0.9898394757409968, + "flos": 21325229032320.0, + "grad_norm": 2.9262763751683796, + "language_loss": 0.74730146, + "learning_rate": 1.0702941289533196e-09, + "loss": 0.76919669, + "num_input_tokens_seen": 177625990, + "step": 8232, + "time_per_iteration": 2.7073898315429688 + }, + { + "auxiliary_loss_clip": 0.01105629, + "auxiliary_loss_mlp": 0.01083953, + "balance_loss_clip": 1.0235889, + "balance_loss_mlp": 1.00393963, + "epoch": 0.9899597186316359, + "flos": 18545024442240.0, + "grad_norm": 2.080858174539259, + "language_loss": 0.89029479, + "learning_rate": 1.0449648271939615e-09, + "loss": 0.91219056, + "num_input_tokens_seen": 177642335, + "step": 8233, + "time_per_iteration": 2.7560012340545654 + }, + { + "auxiliary_loss_clip": 0.01091797, + "auxiliary_loss_mlp": 0.00872832, + "balance_loss_clip": 1.01927495, + "balance_loss_mlp": 1.00006878, + "epoch": 0.990079961522275, + "flos": 23766723348480.0, + "grad_norm": 1.4295182371901132, + "language_loss": 0.72581851, + "learning_rate": 1.0199387653240243e-09, + "loss": 0.7454648, + "num_input_tokens_seen": 177662025, + "step": 8234, + "time_per_iteration": 2.825270414352417 + }, + { + "auxiliary_loss_clip": 0.01117687, + "auxiliary_loss_mlp": 0.01083799, + "balance_loss_clip": 1.02557516, + "balance_loss_mlp": 1.00378561, + "epoch": 0.9902002044129141, + "flos": 16399182971520.0, + "grad_norm": 2.542934198216227, + "language_loss": 0.7088896, + "learning_rate": 9.952159471400267e-10, + "loss": 0.73090446, + "num_input_tokens_seen": 177679065, + "step": 8235, + "time_per_iteration": 2.7041397094726562 + }, + { + "auxiliary_loss_clip": 0.01109473, + "auxiliary_loss_mlp": 0.00872763, + "balance_loss_clip": 1.02554429, + "balance_loss_mlp": 1.00008726, + "epoch": 0.9903204473035532, + "flos": 22559657783040.0, + "grad_norm": 1.7873341254485309, + "language_loss": 0.84361184, + "learning_rate": 9.707963763923022e-10, + "loss": 0.8634342, + "num_input_tokens_seen": 177698115, + "step": 8236, + "time_per_iteration": 2.694287061691284 + }, + { + "auxiliary_loss_clip": 0.01116694, + "auxiliary_loss_mlp": 0.01084787, + "balance_loss_clip": 1.02400041, + "balance_loss_mlp": 1.00477362, + "epoch": 0.9904406901941922, + "flos": 16144001775360.0, + "grad_norm": 2.220580367264424, + "language_loss": 0.79174018, + "learning_rate": 9.466800567854427e-10, + "loss": 0.81375504, + "num_input_tokens_seen": 177716715, + "step": 8237, + "time_per_iteration": 2.7265031337738037 + }, + { + "auxiliary_loss_clip": 0.01094193, + "auxiliary_loss_mlp": 0.01084773, + "balance_loss_clip": 1.02536702, + "balance_loss_mlp": 1.00456882, + "epoch": 0.9905609330848314, + "flos": 26651499408000.0, + "grad_norm": 1.8129345620025916, + "language_loss": 0.67788494, + "learning_rate": 9.228669919778553e-10, + "loss": 0.69967461, + "num_input_tokens_seen": 177735640, + "step": 8238, + "time_per_iteration": 2.8012449741363525 + }, + { + "auxiliary_loss_clip": 0.01102133, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_clip": 1.02582657, + "balance_loss_mlp": 1.00332212, + "epoch": 0.9906811759754705, + "flos": 23111820627840.0, + "grad_norm": 2.0885437429219427, + "language_loss": 0.79231274, + "learning_rate": 8.993571855817617e-10, + "loss": 0.81416845, + "num_input_tokens_seen": 177754470, + "step": 8239, + "time_per_iteration": 2.7324471473693848 + }, + { + "auxiliary_loss_clip": 0.0112311, + "auxiliary_loss_mlp": 0.01083898, + "balance_loss_clip": 1.02310419, + "balance_loss_mlp": 1.0037415, + "epoch": 0.9908014188661095, + "flos": 22090593052800.0, + "grad_norm": 1.9011896388816114, + "language_loss": 0.7509371, + "learning_rate": 8.761506411638642e-10, + "loss": 0.77300715, + "num_input_tokens_seen": 177773935, + "step": 8240, + "time_per_iteration": 2.6224539279937744 + }, + { + "auxiliary_loss_clip": 0.01110791, + "auxiliary_loss_mlp": 0.0108419, + "balance_loss_clip": 1.02089763, + "balance_loss_mlp": 1.00412881, + "epoch": 0.9909216617567487, + "flos": 19242948677760.0, + "grad_norm": 1.6820675287672877, + "language_loss": 0.73446202, + "learning_rate": 8.53247362244236e-10, + "loss": 0.75641179, + "num_input_tokens_seen": 177792745, + "step": 8241, + "time_per_iteration": 2.8051486015319824 + }, + { + "auxiliary_loss_clip": 0.01114881, + "auxiliary_loss_mlp": 0.01084277, + "balance_loss_clip": 1.02373612, + "balance_loss_mlp": 1.00412059, + "epoch": 0.9910419046473877, + "flos": 23621213352960.0, + "grad_norm": 1.6018579042808245, + "language_loss": 0.68221605, + "learning_rate": 8.306473522976532e-10, + "loss": 0.70420766, + "num_input_tokens_seen": 177812150, + "step": 8242, + "time_per_iteration": 2.708744525909424 + }, + { + "auxiliary_loss_clip": 0.01136403, + "auxiliary_loss_mlp": 0.0108469, + "balance_loss_clip": 1.02671218, + "balance_loss_mlp": 1.00458145, + "epoch": 0.9911621475380268, + "flos": 22711380831360.0, + "grad_norm": 1.8958274558430577, + "language_loss": 0.715967, + "learning_rate": 8.083506147522623e-10, + "loss": 0.73817796, + "num_input_tokens_seen": 177831545, + "step": 8243, + "time_per_iteration": 2.631988763809204 + }, + { + "auxiliary_loss_clip": 0.01125955, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_clip": 1.02500701, + "balance_loss_mlp": 1.00457311, + "epoch": 0.991282390428666, + "flos": 13516956777600.0, + "grad_norm": 2.015568911027952, + "language_loss": 0.85130548, + "learning_rate": 7.863571529906909e-10, + "loss": 0.87341183, + "num_input_tokens_seen": 177847130, + "step": 8244, + "time_per_iteration": 2.637860059738159 + }, + { + "auxiliary_loss_clip": 0.01104958, + "auxiliary_loss_mlp": 0.01079127, + "balance_loss_clip": 1.01760352, + "balance_loss_mlp": 1.00016236, + "epoch": 0.991402633319305, + "flos": 61830492071040.0, + "grad_norm": 1.2492416606039074, + "language_loss": 0.59673226, + "learning_rate": 7.646669703489372e-10, + "loss": 0.61857307, + "num_input_tokens_seen": 177911440, + "step": 8245, + "time_per_iteration": 3.3503832817077637 + }, + { + "auxiliary_loss_clip": 0.01056586, + "auxiliary_loss_mlp": 0.01083584, + "balance_loss_clip": 1.02083886, + "balance_loss_mlp": 1.00357056, + "epoch": 0.9915228762099441, + "flos": 18770148933120.0, + "grad_norm": 2.036076824660223, + "language_loss": 0.57241535, + "learning_rate": 7.432800701177023e-10, + "loss": 0.593817, + "num_input_tokens_seen": 177929440, + "step": 8246, + "time_per_iteration": 3.1290457248687744 + }, + { + "auxiliary_loss_clip": 0.01097021, + "auxiliary_loss_mlp": 0.01078801, + "balance_loss_clip": 1.01806116, + "balance_loss_mlp": 0.99983674, + "epoch": 0.9916431191005832, + "flos": 65936660244480.0, + "grad_norm": 0.8042596146422386, + "language_loss": 0.57860625, + "learning_rate": 7.221964555415017e-10, + "loss": 0.60036451, + "num_input_tokens_seen": 177989100, + "step": 8247, + "time_per_iteration": 3.4183332920074463 + }, + { + "auxiliary_loss_clip": 0.01116637, + "auxiliary_loss_mlp": 0.01082911, + "balance_loss_clip": 1.02417874, + "balance_loss_mlp": 1.0029459, + "epoch": 0.9917633619912223, + "flos": 16581573256320.0, + "grad_norm": 1.688000049717295, + "language_loss": 0.74982774, + "learning_rate": 7.01416129818222e-10, + "loss": 0.77182323, + "num_input_tokens_seen": 178006720, + "step": 8248, + "time_per_iteration": 2.7310094833374023 + }, + { + "auxiliary_loss_clip": 0.0110707, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_clip": 1.0227536, + "balance_loss_mlp": 1.00443792, + "epoch": 0.9918836048818613, + "flos": 25411108999680.0, + "grad_norm": 1.958838796628239, + "language_loss": 0.58550906, + "learning_rate": 6.809390961006745e-10, + "loss": 0.60742569, + "num_input_tokens_seen": 178026850, + "step": 8249, + "time_per_iteration": 2.8315563201904297 + }, + { + "auxiliary_loss_clip": 0.01109211, + "auxiliary_loss_mlp": 0.01084363, + "balance_loss_clip": 1.02340519, + "balance_loss_mlp": 1.00420713, + "epoch": 0.9920038477725005, + "flos": 25046867134080.0, + "grad_norm": 1.692109192052535, + "language_loss": 0.68256283, + "learning_rate": 6.607653574948191e-10, + "loss": 0.70449859, + "num_input_tokens_seen": 178047630, + "step": 8250, + "time_per_iteration": 2.7170674800872803 + }, + { + "auxiliary_loss_clip": 0.01124311, + "auxiliary_loss_mlp": 0.01084701, + "balance_loss_clip": 1.02374887, + "balance_loss_mlp": 1.00468802, + "epoch": 0.9921240906631396, + "flos": 21829773421440.0, + "grad_norm": 1.8611243630763534, + "language_loss": 0.81678104, + "learning_rate": 6.408949170613187e-10, + "loss": 0.83887118, + "num_input_tokens_seen": 178066895, + "step": 8251, + "time_per_iteration": 4.694150447845459 + }, + { + "auxiliary_loss_clip": 0.01117848, + "auxiliary_loss_mlp": 0.01084591, + "balance_loss_clip": 1.02485597, + "balance_loss_mlp": 1.00443494, + "epoch": 0.9922443335537786, + "flos": 24864225454080.0, + "grad_norm": 1.6134066579244724, + "language_loss": 0.82023627, + "learning_rate": 6.213277778144288e-10, + "loss": 0.8422606, + "num_input_tokens_seen": 178088540, + "step": 8252, + "time_per_iteration": 2.8275790214538574 + }, + { + "auxiliary_loss_clip": 0.01071925, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_clip": 1.02228665, + "balance_loss_mlp": 1.00490808, + "epoch": 0.9923645764444178, + "flos": 21613088626560.0, + "grad_norm": 1.9409262105252563, + "language_loss": 0.67516869, + "learning_rate": 6.020639427224416e-10, + "loss": 0.69673908, + "num_input_tokens_seen": 178106185, + "step": 8253, + "time_per_iteration": 2.8190886974334717 + }, + { + "auxiliary_loss_clip": 0.01115376, + "auxiliary_loss_mlp": 0.01083734, + "balance_loss_clip": 1.02414632, + "balance_loss_mlp": 1.0036726, + "epoch": 0.9924848193350568, + "flos": 25001798544000.0, + "grad_norm": 2.0351996433302273, + "language_loss": 0.73107392, + "learning_rate": 5.831034147076864e-10, + "loss": 0.75306499, + "num_input_tokens_seen": 178123435, + "step": 8254, + "time_per_iteration": 2.8411076068878174 + }, + { + "auxiliary_loss_clip": 0.01104805, + "auxiliary_loss_mlp": 0.01079194, + "balance_loss_clip": 1.01841438, + "balance_loss_mlp": 1.00022936, + "epoch": 0.9926050622256959, + "flos": 68912543151360.0, + "grad_norm": 0.6846515467750249, + "language_loss": 0.55777943, + "learning_rate": 5.644461966463065e-10, + "loss": 0.57961947, + "num_input_tokens_seen": 178191045, + "step": 8255, + "time_per_iteration": 4.2049665451049805 + }, + { + "auxiliary_loss_clip": 0.01114146, + "auxiliary_loss_mlp": 0.0108312, + "balance_loss_clip": 1.02339292, + "balance_loss_mlp": 1.00305927, + "epoch": 0.9927253051163349, + "flos": 20923675914240.0, + "grad_norm": 3.034381322383284, + "language_loss": 0.75813669, + "learning_rate": 5.460922913687049e-10, + "loss": 0.78010935, + "num_input_tokens_seen": 178210135, + "step": 8256, + "time_per_iteration": 3.648015022277832 + }, + { + "auxiliary_loss_clip": 0.01098654, + "auxiliary_loss_mlp": 0.00872983, + "balance_loss_clip": 1.02243388, + "balance_loss_mlp": 1.00003803, + "epoch": 0.9928455480069741, + "flos": 22308211601280.0, + "grad_norm": 2.191214647091391, + "language_loss": 0.75159919, + "learning_rate": 5.280417016593208e-10, + "loss": 0.77131552, + "num_input_tokens_seen": 178229925, + "step": 8257, + "time_per_iteration": 2.811823844909668 + }, + { + "auxiliary_loss_clip": 0.01125688, + "auxiliary_loss_mlp": 0.00872889, + "balance_loss_clip": 1.02659416, + "balance_loss_mlp": 1.00010443, + "epoch": 0.9929657908976132, + "flos": 17383889393280.0, + "grad_norm": 1.6405261707390564, + "language_loss": 0.74871266, + "learning_rate": 5.102944302559642e-10, + "loss": 0.76869845, + "num_input_tokens_seen": 178247420, + "step": 8258, + "time_per_iteration": 2.6490914821624756 + }, + { + "auxiliary_loss_clip": 0.01083852, + "auxiliary_loss_mlp": 0.01085527, + "balance_loss_clip": 1.01924825, + "balance_loss_mlp": 1.00546646, + "epoch": 0.9930860337882522, + "flos": 22674680110080.0, + "grad_norm": 2.3502314898115957, + "language_loss": 0.7934016, + "learning_rate": 4.9285047985137e-10, + "loss": 0.81509531, + "num_input_tokens_seen": 178266840, + "step": 8259, + "time_per_iteration": 2.847919225692749 + }, + { + "auxiliary_loss_clip": 0.01128057, + "auxiliary_loss_mlp": 0.0108456, + "balance_loss_clip": 1.02649152, + "balance_loss_mlp": 1.00435638, + "epoch": 0.9932062766788914, + "flos": 28147789284480.0, + "grad_norm": 1.841872335796994, + "language_loss": 0.74686933, + "learning_rate": 4.757098530916436e-10, + "loss": 0.76899552, + "num_input_tokens_seen": 178287285, + "step": 8260, + "time_per_iteration": 2.750519275665283 + }, + { + "auxiliary_loss_clip": 0.01125476, + "auxiliary_loss_mlp": 0.01083865, + "balance_loss_clip": 1.0255723, + "balance_loss_mlp": 1.0037564, + "epoch": 0.9933265195695304, + "flos": 20156659868160.0, + "grad_norm": 3.176937054205833, + "language_loss": 0.77386272, + "learning_rate": 4.5887255257670563e-10, + "loss": 0.79595613, + "num_input_tokens_seen": 178304325, + "step": 8261, + "time_per_iteration": 2.6936659812927246 + }, + { + "auxiliary_loss_clip": 0.01134096, + "auxiliary_loss_mlp": 0.01083441, + "balance_loss_clip": 1.02492285, + "balance_loss_mlp": 1.00337982, + "epoch": 0.9934467624601695, + "flos": 21362037494400.0, + "grad_norm": 1.854789040708131, + "language_loss": 0.7691977, + "learning_rate": 4.4233858086117906e-10, + "loss": 0.79137301, + "num_input_tokens_seen": 178322850, + "step": 8262, + "time_per_iteration": 2.670557975769043 + }, + { + "auxiliary_loss_clip": 0.0109463, + "auxiliary_loss_mlp": 0.01084094, + "balance_loss_clip": 1.02344537, + "balance_loss_mlp": 1.00403333, + "epoch": 0.9935670053508087, + "flos": 19756040503680.0, + "grad_norm": 3.4085609243818005, + "language_loss": 0.67859381, + "learning_rate": 4.261079404528356e-10, + "loss": 0.70038104, + "num_input_tokens_seen": 178342330, + "step": 8263, + "time_per_iteration": 2.778122901916504 + }, + { + "auxiliary_loss_clip": 0.01126862, + "auxiliary_loss_mlp": 0.01084437, + "balance_loss_clip": 1.0250895, + "balance_loss_mlp": 1.00428104, + "epoch": 0.9936872482414477, + "flos": 21978838863360.0, + "grad_norm": 1.6401893618711898, + "language_loss": 0.68942654, + "learning_rate": 4.1018063381437205e-10, + "loss": 0.71153957, + "num_input_tokens_seen": 178362715, + "step": 8264, + "time_per_iteration": 2.7152819633483887 + }, + { + "auxiliary_loss_clip": 0.01099367, + "auxiliary_loss_mlp": 0.01079218, + "balance_loss_clip": 1.02164638, + "balance_loss_mlp": 1.00025356, + "epoch": 0.9938074911320868, + "flos": 69810667839360.0, + "grad_norm": 0.8583966091316498, + "language_loss": 0.61184645, + "learning_rate": 3.9455666336141167e-10, + "loss": 0.6336323, + "num_input_tokens_seen": 178426495, + "step": 8265, + "time_per_iteration": 3.295780897140503 + }, + { + "auxiliary_loss_clip": 0.01134473, + "auxiliary_loss_mlp": 0.01083896, + "balance_loss_clip": 1.02596831, + "balance_loss_mlp": 1.00378704, + "epoch": 0.9939277340227259, + "flos": 15084170058240.0, + "grad_norm": 3.357436243168958, + "language_loss": 0.83033276, + "learning_rate": 3.7923603146450267e-10, + "loss": 0.85251647, + "num_input_tokens_seen": 178442555, + "step": 8266, + "time_per_iteration": 2.6446008682250977 + }, + { + "auxiliary_loss_clip": 0.01105303, + "auxiliary_loss_mlp": 0.01083809, + "balance_loss_clip": 1.02262092, + "balance_loss_mlp": 1.00365233, + "epoch": 0.994047976913365, + "flos": 17712364291200.0, + "grad_norm": 2.155397104484302, + "language_loss": 0.80885804, + "learning_rate": 3.642187404473418e-10, + "loss": 0.83074915, + "num_input_tokens_seen": 178460715, + "step": 8267, + "time_per_iteration": 2.668755531311035 + }, + { + "auxiliary_loss_clip": 0.01125469, + "auxiliary_loss_mlp": 0.01083322, + "balance_loss_clip": 1.02474391, + "balance_loss_mlp": 1.00326133, + "epoch": 0.994168219804004, + "flos": 19171558396800.0, + "grad_norm": 2.5229561527837747, + "language_loss": 0.85787261, + "learning_rate": 3.495047925885508e-10, + "loss": 0.87996054, + "num_input_tokens_seen": 178479050, + "step": 8268, + "time_per_iteration": 2.682372570037842 + }, + { + "auxiliary_loss_clip": 0.01119137, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_clip": 1.02595949, + "balance_loss_mlp": 1.00434184, + "epoch": 0.9942884626946432, + "flos": 17851589406720.0, + "grad_norm": 2.4568708594601794, + "language_loss": 0.83089173, + "learning_rate": 3.350941901199e-10, + "loss": 0.852929, + "num_input_tokens_seen": 178495970, + "step": 8269, + "time_per_iteration": 2.708434581756592 + }, + { + "auxiliary_loss_clip": 0.01117631, + "auxiliary_loss_mlp": 0.01084323, + "balance_loss_clip": 1.02492714, + "balance_loss_mlp": 1.00426221, + "epoch": 0.9944087055852823, + "flos": 18796578364800.0, + "grad_norm": 14.068058501814253, + "language_loss": 0.83368087, + "learning_rate": 3.2098693522764066e-10, + "loss": 0.85570037, + "num_input_tokens_seen": 178509170, + "step": 8270, + "time_per_iteration": 2.7253477573394775 + }, + { + "auxiliary_loss_clip": 0.01117041, + "auxiliary_loss_mlp": 0.00872913, + "balance_loss_clip": 1.02438772, + "balance_loss_mlp": 1.00009727, + "epoch": 0.9945289484759213, + "flos": 20996969616000.0, + "grad_norm": 2.1059618375732727, + "language_loss": 0.80858362, + "learning_rate": 3.071830300516165e-10, + "loss": 0.8284831, + "num_input_tokens_seen": 178527000, + "step": 8271, + "time_per_iteration": 2.793539524078369 + }, + { + "auxiliary_loss_clip": 0.01125699, + "auxiliary_loss_mlp": 0.01085317, + "balance_loss_clip": 1.02450764, + "balance_loss_mlp": 1.00511312, + "epoch": 0.9946491913665605, + "flos": 14756952136320.0, + "grad_norm": 2.5194304635487743, + "language_loss": 0.70847005, + "learning_rate": 2.9368247668615234e-10, + "loss": 0.73058021, + "num_input_tokens_seen": 178545590, + "step": 8272, + "time_per_iteration": 2.6505627632141113 + }, + { + "auxiliary_loss_clip": 0.01135995, + "auxiliary_loss_mlp": 0.01084347, + "balance_loss_clip": 1.02664661, + "balance_loss_mlp": 1.00409484, + "epoch": 0.9947694342571995, + "flos": 12669931186560.0, + "grad_norm": 2.32758299681854, + "language_loss": 0.61194813, + "learning_rate": 2.804852771789434e-10, + "loss": 0.63415158, + "num_input_tokens_seen": 178558890, + "step": 8273, + "time_per_iteration": 2.70003604888916 + }, + { + "auxiliary_loss_clip": 0.01133797, + "auxiliary_loss_mlp": 0.01083378, + "balance_loss_clip": 1.02455902, + "balance_loss_mlp": 1.00331724, + "epoch": 0.9948896771478386, + "flos": 18843442634880.0, + "grad_norm": 3.409072920876816, + "language_loss": 0.5576278, + "learning_rate": 2.675914335321661e-10, + "loss": 0.57979953, + "num_input_tokens_seen": 178577645, + "step": 8274, + "time_per_iteration": 2.5779221057891846 + }, + { + "auxiliary_loss_clip": 0.01126578, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_clip": 1.0251658, + "balance_loss_mlp": 1.00354087, + "epoch": 0.9950099200384778, + "flos": 24900207903360.0, + "grad_norm": 2.3933941262199694, + "language_loss": 0.79512322, + "learning_rate": 2.550009477018111e-10, + "loss": 0.81722641, + "num_input_tokens_seen": 178596415, + "step": 8275, + "time_per_iteration": 2.703324317932129 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.00872948, + "balance_loss_clip": 1.02337158, + "balance_loss_mlp": 1.00004888, + "epoch": 0.9951301629291168, + "flos": 23733613987200.0, + "grad_norm": 2.872925983600078, + "language_loss": 0.62894046, + "learning_rate": 2.4271382159790634e-10, + "loss": 0.64881313, + "num_input_tokens_seen": 178613845, + "step": 8276, + "time_per_iteration": 3.6458659172058105 + }, + { + "auxiliary_loss_clip": 0.01073953, + "auxiliary_loss_mlp": 0.01084242, + "balance_loss_clip": 1.02405214, + "balance_loss_mlp": 1.00418139, + "epoch": 0.9952504058197559, + "flos": 22236893147520.0, + "grad_norm": 3.559569914980001, + "language_loss": 0.85837597, + "learning_rate": 2.3073005708429406e-10, + "loss": 0.87995791, + "num_input_tokens_seen": 178633490, + "step": 8277, + "time_per_iteration": 3.7005062103271484 + }, + { + "auxiliary_loss_clip": 0.01105887, + "auxiliary_loss_mlp": 0.01082761, + "balance_loss_clip": 1.0240016, + "balance_loss_mlp": 1.0028429, + "epoch": 0.995370648710395, + "flos": 21211032718080.0, + "grad_norm": 1.7527393262774638, + "language_loss": 0.71919852, + "learning_rate": 2.190496559788535e-10, + "loss": 0.74108499, + "num_input_tokens_seen": 178651775, + "step": 8278, + "time_per_iteration": 2.8615262508392334 + }, + { + "auxiliary_loss_clip": 0.01112923, + "auxiliary_loss_mlp": 0.0108307, + "balance_loss_clip": 1.02340627, + "balance_loss_mlp": 1.00296175, + "epoch": 0.9954908916010341, + "flos": 14866731077760.0, + "grad_norm": 2.2848477285359623, + "language_loss": 0.766325, + "learning_rate": 2.0767262005372265e-10, + "loss": 0.78828496, + "num_input_tokens_seen": 178669290, + "step": 8279, + "time_per_iteration": 2.735297203063965 + }, + { + "auxiliary_loss_clip": 0.01093787, + "auxiliary_loss_mlp": 0.01084822, + "balance_loss_clip": 1.02526903, + "balance_loss_mlp": 1.00466585, + "epoch": 0.9956111344916732, + "flos": 19208259118080.0, + "grad_norm": 1.7858935648435745, + "language_loss": 0.75045431, + "learning_rate": 1.965989510346322e-10, + "loss": 0.77224046, + "num_input_tokens_seen": 178688410, + "step": 8280, + "time_per_iteration": 2.749802589416504 + }, + { + "auxiliary_loss_clip": 0.01095909, + "auxiliary_loss_mlp": 0.01084637, + "balance_loss_clip": 1.02199268, + "balance_loss_mlp": 1.0044328, + "epoch": 0.9957313773823123, + "flos": 20047060494720.0, + "grad_norm": 2.0014290654294133, + "language_loss": 0.70974684, + "learning_rate": 1.8582865060134955e-10, + "loss": 0.73155236, + "num_input_tokens_seen": 178706600, + "step": 8281, + "time_per_iteration": 3.9402670860290527 + }, + { + "auxiliary_loss_clip": 0.01112962, + "auxiliary_loss_mlp": 0.0107886, + "balance_loss_clip": 1.01778483, + "balance_loss_mlp": 0.99989551, + "epoch": 0.9958516202729514, + "flos": 57483253768320.0, + "grad_norm": 0.7814487388019836, + "language_loss": 0.55801511, + "learning_rate": 1.7536172038790098e-10, + "loss": 0.57993329, + "num_input_tokens_seen": 178766910, + "step": 8282, + "time_per_iteration": 4.184683084487915 + }, + { + "auxiliary_loss_clip": 0.01116025, + "auxiliary_loss_mlp": 0.01083116, + "balance_loss_clip": 1.02473271, + "balance_loss_mlp": 1.00300765, + "epoch": 0.9959718631635904, + "flos": 27782900974080.0, + "grad_norm": 2.0399270653230475, + "language_loss": 0.68920815, + "learning_rate": 1.651981619819054e-10, + "loss": 0.71119964, + "num_input_tokens_seen": 178784060, + "step": 8283, + "time_per_iteration": 2.803250551223755 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.01084371, + "balance_loss_clip": 1.02188849, + "balance_loss_mlp": 1.00426245, + "epoch": 0.9960921060542296, + "flos": 24024095274240.0, + "grad_norm": 2.4684163984988547, + "language_loss": 0.70641595, + "learning_rate": 1.5533797692546257e-10, + "loss": 0.72821701, + "num_input_tokens_seen": 178802795, + "step": 8284, + "time_per_iteration": 2.860767126083374 + }, + { + "auxiliary_loss_clip": 0.0112732, + "auxiliary_loss_mlp": 0.01084662, + "balance_loss_clip": 1.02571571, + "balance_loss_mlp": 1.00450563, + "epoch": 0.9962123489448687, + "flos": 18697393935360.0, + "grad_norm": 1.8298064752591003, + "language_loss": 0.84141099, + "learning_rate": 1.4578116671404296e-10, + "loss": 0.86353076, + "num_input_tokens_seen": 178821075, + "step": 8285, + "time_per_iteration": 2.6159751415252686 + }, + { + "auxiliary_loss_clip": 0.01120409, + "auxiliary_loss_mlp": 0.01083756, + "balance_loss_clip": 1.02659738, + "balance_loss_mlp": 1.00364709, + "epoch": 0.9963325918355077, + "flos": 20010754823040.0, + "grad_norm": 8.913522272318309, + "language_loss": 0.71287727, + "learning_rate": 1.3652773279759777e-10, + "loss": 0.73491883, + "num_input_tokens_seen": 178837725, + "step": 8286, + "time_per_iteration": 2.7176454067230225 + }, + { + "auxiliary_loss_clip": 0.01127058, + "auxiliary_loss_mlp": 0.01083895, + "balance_loss_clip": 1.02574825, + "balance_loss_mlp": 1.0037868, + "epoch": 0.9964528347261468, + "flos": 33108488991360.0, + "grad_norm": 1.9109623535515927, + "language_loss": 0.62955403, + "learning_rate": 1.2757767657989305e-10, + "loss": 0.65166354, + "num_input_tokens_seen": 178861515, + "step": 8287, + "time_per_iteration": 2.769575357437134 + }, + { + "auxiliary_loss_clip": 0.01124615, + "auxiliary_loss_mlp": 0.01083693, + "balance_loss_clip": 1.02523661, + "balance_loss_mlp": 1.00358391, + "epoch": 0.9965730776167859, + "flos": 23109342589440.0, + "grad_norm": 4.132261459544786, + "language_loss": 0.87141794, + "learning_rate": 1.1893099941850948e-10, + "loss": 0.89350104, + "num_input_tokens_seen": 178880410, + "step": 8288, + "time_per_iteration": 2.7152795791625977 + }, + { + "auxiliary_loss_clip": 0.01116729, + "auxiliary_loss_mlp": 0.01084004, + "balance_loss_clip": 1.023718, + "balance_loss_mlp": 1.00384748, + "epoch": 0.996693320507425, + "flos": 22965843755520.0, + "grad_norm": 2.0197355450460837, + "language_loss": 0.77426195, + "learning_rate": 1.105877026252866e-10, + "loss": 0.7962693, + "num_input_tokens_seen": 178898740, + "step": 8289, + "time_per_iteration": 2.7392735481262207 + }, + { + "auxiliary_loss_clip": 0.011343, + "auxiliary_loss_mlp": 0.01083821, + "balance_loss_clip": 1.0251087, + "balance_loss_mlp": 1.00366449, + "epoch": 0.996813563398064, + "flos": 13222740476160.0, + "grad_norm": 1.7613555180364735, + "language_loss": 0.72257221, + "learning_rate": 1.0254778746565663e-10, + "loss": 0.74475336, + "num_input_tokens_seen": 178914015, + "step": 8290, + "time_per_iteration": 2.633817672729492 + }, + { + "auxiliary_loss_clip": 0.01103983, + "auxiliary_loss_mlp": 0.01084549, + "balance_loss_clip": 1.02291214, + "balance_loss_mlp": 1.00458372, + "epoch": 0.9969338062887032, + "flos": 14647855553280.0, + "grad_norm": 1.9402062102057875, + "language_loss": 0.73031545, + "learning_rate": 9.481125515953259e-11, + "loss": 0.75220072, + "num_input_tokens_seen": 178932075, + "step": 8291, + "time_per_iteration": 2.743302583694458 + }, + { + "auxiliary_loss_clip": 0.01099598, + "auxiliary_loss_mlp": 0.01084046, + "balance_loss_clip": 1.02346802, + "balance_loss_mlp": 1.0038898, + "epoch": 0.9970540491793423, + "flos": 25735741142400.0, + "grad_norm": 1.6164384062997441, + "language_loss": 0.79719079, + "learning_rate": 8.737810688064228e-11, + "loss": 0.81902719, + "num_input_tokens_seen": 178951910, + "step": 8292, + "time_per_iteration": 2.8670265674591064 + }, + { + "auxiliary_loss_clip": 0.01101423, + "auxiliary_loss_mlp": 0.01083241, + "balance_loss_clip": 1.02035892, + "balance_loss_mlp": 1.00303686, + "epoch": 0.9971742920699813, + "flos": 21470236237440.0, + "grad_norm": 2.20016908415848, + "language_loss": 0.7920506, + "learning_rate": 8.024834375608414e-11, + "loss": 0.81389725, + "num_input_tokens_seen": 178970500, + "step": 8293, + "time_per_iteration": 2.7650530338287354 + }, + { + "auxiliary_loss_clip": 0.01113012, + "auxiliary_loss_mlp": 0.01078844, + "balance_loss_clip": 1.01786411, + "balance_loss_mlp": 0.99987948, + "epoch": 0.9972945349606205, + "flos": 72211223629440.0, + "grad_norm": 0.8197012926519377, + "language_loss": 0.62884814, + "learning_rate": 7.342196686788149e-11, + "loss": 0.65076673, + "num_input_tokens_seen": 179023665, + "step": 8294, + "time_per_iteration": 3.172794818878174 + }, + { + "auxiliary_loss_clip": 0.0110445, + "auxiliary_loss_mlp": 0.01084526, + "balance_loss_clip": 1.02097821, + "balance_loss_mlp": 1.00441742, + "epoch": 0.9974147778512595, + "flos": 19678293515520.0, + "grad_norm": 1.9502806272531878, + "language_loss": 0.68226266, + "learning_rate": 6.689897725142834e-11, + "loss": 0.70415246, + "num_input_tokens_seen": 179043140, + "step": 8295, + "time_per_iteration": 2.7378311157226562 + }, + { + "auxiliary_loss_clip": 0.01115715, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_clip": 1.02435029, + "balance_loss_mlp": 1.00372279, + "epoch": 0.9975350207418986, + "flos": 15960821391360.0, + "grad_norm": 2.5195321161637256, + "language_loss": 0.88318896, + "learning_rate": 6.067937589615545e-11, + "loss": 0.90518439, + "num_input_tokens_seen": 179061215, + "step": 8296, + "time_per_iteration": 2.6504249572753906 + }, + { + "auxiliary_loss_clip": 0.01096618, + "auxiliary_loss_mlp": 0.01078974, + "balance_loss_clip": 1.01742435, + "balance_loss_mlp": 1.00000978, + "epoch": 0.9976552636325378, + "flos": 59961879768960.0, + "grad_norm": 0.7394890228313169, + "language_loss": 0.57672727, + "learning_rate": 5.476316374575241e-11, + "loss": 0.5984832, + "num_input_tokens_seen": 179124700, + "step": 8297, + "time_per_iteration": 3.316352128982544 + }, + { + "auxiliary_loss_clip": 0.01134769, + "auxiliary_loss_mlp": 0.01084243, + "balance_loss_clip": 1.02572966, + "balance_loss_mlp": 1.00408638, + "epoch": 0.9977755065231768, + "flos": 22487872452480.0, + "grad_norm": 2.2183609565068148, + "language_loss": 0.7284795, + "learning_rate": 4.9150341697723476e-11, + "loss": 0.7506696, + "num_input_tokens_seen": 179144590, + "step": 8298, + "time_per_iteration": 2.6953485012054443 + }, + { + "auxiliary_loss_clip": 0.01118708, + "auxiliary_loss_mlp": 0.01085839, + "balance_loss_clip": 1.02546346, + "balance_loss_mlp": 1.00573027, + "epoch": 0.9978957494138159, + "flos": 26030280666240.0, + "grad_norm": 1.53583453688147, + "language_loss": 0.66544127, + "learning_rate": 4.384091060338768e-11, + "loss": 0.68748677, + "num_input_tokens_seen": 179165060, + "step": 8299, + "time_per_iteration": 2.866100549697876 + }, + { + "auxiliary_loss_clip": 0.01124315, + "auxiliary_loss_mlp": 0.01082601, + "balance_loss_clip": 1.02447927, + "balance_loss_mlp": 1.0025878, + "epoch": 0.998015992304455, + "flos": 22637835734400.0, + "grad_norm": 2.163577375865013, + "language_loss": 0.73532289, + "learning_rate": 3.883487126810081e-11, + "loss": 0.75739211, + "num_input_tokens_seen": 179184320, + "step": 8300, + "time_per_iteration": 2.7441325187683105 + }, + { + "auxiliary_loss_clip": 0.01126628, + "auxiliary_loss_mlp": 0.01083508, + "balance_loss_clip": 1.02539396, + "balance_loss_mlp": 1.00354195, + "epoch": 0.9981362351950941, + "flos": 18223444955520.0, + "grad_norm": 1.5961440834809215, + "language_loss": 0.79121053, + "learning_rate": 3.41322244516995e-11, + "loss": 0.81331193, + "num_input_tokens_seen": 179202265, + "step": 8301, + "time_per_iteration": 3.6304352283477783 + }, + { + "auxiliary_loss_clip": 0.01091388, + "auxiliary_loss_mlp": 0.01083196, + "balance_loss_clip": 1.01903617, + "balance_loss_mlp": 1.00308728, + "epoch": 0.9982564780857331, + "flos": 33474095573760.0, + "grad_norm": 1.6288002873361833, + "language_loss": 0.62817824, + "learning_rate": 2.9732970866946925e-11, + "loss": 0.64992404, + "num_input_tokens_seen": 179222145, + "step": 8302, + "time_per_iteration": 2.9206812381744385 + }, + { + "auxiliary_loss_clip": 0.01107477, + "auxiliary_loss_mlp": 0.01083557, + "balance_loss_clip": 1.02393568, + "balance_loss_mlp": 1.00335336, + "epoch": 0.9983767209763723, + "flos": 15523465392000.0, + "grad_norm": 2.4803680813413345, + "language_loss": 0.77872425, + "learning_rate": 2.563711118175327e-11, + "loss": 0.80063462, + "num_input_tokens_seen": 179239030, + "step": 8303, + "time_per_iteration": 3.7704086303710938 + }, + { + "auxiliary_loss_clip": 0.01107022, + "auxiliary_loss_mlp": 0.01084899, + "balance_loss_clip": 1.02414608, + "balance_loss_mlp": 1.00483847, + "epoch": 0.9984969638670114, + "flos": 19974377324160.0, + "grad_norm": 1.6482283728080906, + "language_loss": 0.83557427, + "learning_rate": 2.184464601717728e-11, + "loss": 0.8574934, + "num_input_tokens_seen": 179257345, + "step": 8304, + "time_per_iteration": 2.8255040645599365 + }, + { + "auxiliary_loss_clip": 0.01126781, + "auxiliary_loss_mlp": 0.01084037, + "balance_loss_clip": 1.02632189, + "balance_loss_mlp": 1.00388086, + "epoch": 0.9986172067576504, + "flos": 20375750874240.0, + "grad_norm": 4.4885414430010595, + "language_loss": 0.77375484, + "learning_rate": 1.8355575948758585e-11, + "loss": 0.79586303, + "num_input_tokens_seen": 179275330, + "step": 8305, + "time_per_iteration": 3.6559102535247803 + }, + { + "auxiliary_loss_clip": 0.01118378, + "auxiliary_loss_mlp": 0.01084398, + "balance_loss_clip": 1.02503908, + "balance_loss_mlp": 1.00424135, + "epoch": 0.9987374496482896, + "flos": 23727903724800.0, + "grad_norm": 2.0137932692845713, + "language_loss": 0.7314474, + "learning_rate": 1.5169901505407424e-11, + "loss": 0.75347519, + "num_input_tokens_seen": 179292395, + "step": 8306, + "time_per_iteration": 2.738182306289673 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01083352, + "balance_loss_clip": 1.02247834, + "balance_loss_mlp": 1.00338674, + "epoch": 0.9988576925389286, + "flos": 25044029959680.0, + "grad_norm": 1.8144201860694706, + "language_loss": 0.74197966, + "learning_rate": 1.228762317073695e-11, + "loss": 0.76389325, + "num_input_tokens_seen": 179311225, + "step": 8307, + "time_per_iteration": 3.6911256313323975 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01084353, + "balance_loss_clip": 1.02090335, + "balance_loss_mlp": 1.00429177, + "epoch": 0.9989779354295677, + "flos": 31285627637760.0, + "grad_norm": 1.8195011894610018, + "language_loss": 0.79040438, + "learning_rate": 9.70874138195299e-12, + "loss": 0.81235981, + "num_input_tokens_seen": 179333135, + "step": 8308, + "time_per_iteration": 2.852707862854004 + }, + { + "auxiliary_loss_clip": 0.01134975, + "auxiliary_loss_mlp": 0.01084378, + "balance_loss_clip": 1.02527547, + "balance_loss_mlp": 1.00422168, + "epoch": 0.9990981783202069, + "flos": 19573398823680.0, + "grad_norm": 2.064854333815127, + "language_loss": 0.74660444, + "learning_rate": 7.433256530076093e-12, + "loss": 0.76879799, + "num_input_tokens_seen": 179353090, + "step": 8309, + "time_per_iteration": 2.652297258377075 + }, + { + "auxiliary_loss_clip": 0.01099044, + "auxiliary_loss_mlp": 0.01083863, + "balance_loss_clip": 1.02289367, + "balance_loss_mlp": 1.00389743, + "epoch": 0.9992184212108459, + "flos": 17199667514880.0, + "grad_norm": 3.329275257407019, + "language_loss": 0.76041043, + "learning_rate": 5.46116896038562e-12, + "loss": 0.78223956, + "num_input_tokens_seen": 179367500, + "step": 8310, + "time_per_iteration": 2.9220035076141357 + }, + { + "auxiliary_loss_clip": 0.01111548, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_clip": 1.02088571, + "balance_loss_mlp": 1.00362742, + "epoch": 0.999338664101485, + "flos": 46497853681920.0, + "grad_norm": 1.8551293898625312, + "language_loss": 0.61857438, + "learning_rate": 3.792478972197699e-12, + "loss": 0.64052671, + "num_input_tokens_seen": 179388085, + "step": 8311, + "time_per_iteration": 3.011188268661499 + }, + { + "auxiliary_loss_clip": 0.01134691, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_clip": 1.02531672, + "balance_loss_mlp": 1.00312412, + "epoch": 0.9994589069921241, + "flos": 15158253859200.0, + "grad_norm": 2.286912182495602, + "language_loss": 0.70500672, + "learning_rate": 2.4271868181990895e-12, + "loss": 0.72718549, + "num_input_tokens_seen": 179405250, + "step": 8312, + "time_per_iteration": 2.658392906188965 + }, + { + "auxiliary_loss_clip": 0.01125807, + "auxiliary_loss_mlp": 0.0108477, + "balance_loss_clip": 1.02500618, + "balance_loss_mlp": 1.00470948, + "epoch": 0.9995791498827632, + "flos": 12531460256640.0, + "grad_norm": 2.1919942064278084, + "language_loss": 0.80817151, + "learning_rate": 1.3652927060014973e-12, + "loss": 0.8302772, + "num_input_tokens_seen": 179420845, + "step": 8313, + "time_per_iteration": 2.677932024002075 + }, + { + "auxiliary_loss_clip": 0.01101703, + "auxiliary_loss_mlp": 0.01083401, + "balance_loss_clip": 1.02025056, + "balance_loss_mlp": 1.00324452, + "epoch": 0.9996993927734023, + "flos": 19245175320960.0, + "grad_norm": 1.8132999928301272, + "language_loss": 0.63568997, + "learning_rate": 6.067967965872612e-13, + "loss": 0.65754098, + "num_input_tokens_seen": 179440455, + "step": 8314, + "time_per_iteration": 2.715634346008301 + }, + { + "auxiliary_loss_clip": 0.01099172, + "auxiliary_loss_mlp": 0.01084096, + "balance_loss_clip": 1.02288461, + "balance_loss_mlp": 1.00403547, + "epoch": 0.9998196356640414, + "flos": 62952804518400.0, + "grad_norm": 1.54859471755541, + "language_loss": 0.77142745, + "learning_rate": 1.5169920497548615e-13, + "loss": 0.79326016, + "num_input_tokens_seen": 179465075, + "step": 8315, + "time_per_iteration": 3.137319564819336 + }, + { + "auxiliary_loss_clip": 0.01115313, + "auxiliary_loss_mlp": 0.01081324, + "balance_loss_clip": 1.02062607, + "balance_loss_mlp": 1.00164449, + "epoch": 0.9999398785546805, + "flos": 50922375073920.0, + "grad_norm": 1.0995392904925272, + "language_loss": 0.55085599, + "learning_rate": 0.0, + "loss": 0.57282233, + "num_input_tokens_seen": 179513955, + "step": 8316, + "time_per_iteration": 3.2136342525482178 + }, + { + "epoch": 0.9999398785546805, + "num_input_tokens_seen": 179513955, + "step": 8316, + "total_flos": 6.996749092776837e+17, + "train_loss": 0.7892374441264167, + "train_runtime": 25934.9365, + "train_samples_per_second": 12.826, + "train_steps_per_second": 0.321 + } + ], + "logging_steps": 1.0, + "max_steps": 8316, + "num_input_tokens_seen": 179513955, + "num_train_epochs": 1, + "save_steps": 1664, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.996749092776837e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/training_args.bin b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f63cb0929ce7b9c901cd4229f42776bf12492e0e --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Softplus_diversity_3loss_adv_norm_competesmoe/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee20b5f40e9a28d76a401e80be5e3f3e0a35f68604d61701d0b6fd1478c3bcfa +size 7992